<a href="https://colab.research.google.com/github/oluwafemidiakhoa/MLprject/blob/main/AIReseacher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI, RateLimitError
from kaggle.api.kaggle_api_extended import KaggleApi
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy.stats import linregress, ttest_ind

# Validate credentials and initialize API clients.
# The previous `os.environ[name] = os.getenv(name)` round-trip changed nothing
# when a variable was set and raised an opaque
# "TypeError: str expected, not NoneType" when it was not, so the missing
# names are reported explicitly instead.
_REQUIRED_ENV_VARS = ("OPENAI_API_KEY", "KAGGLE_USERNAME", "KAGGLE_KEY")
_missing_env_vars = [
    var_name for var_name in _REQUIRED_ENV_VARS if os.getenv(var_name) is None
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Base class for agents with OpenAI and Kaggle functionalities
class Agent:
    """Base class giving every agent an OpenAI client and a Kaggle client.

    Constructing an agent authenticates against Kaggle immediately, so Kaggle
    credentials must be available before any agent is created.
    """

    def __init__(self, name, role):
        """Store the agent's identity and set up its API clients.

        Args:
            name: Human-readable agent name.
            role: Short description of what the agent does.
        """
        self.name = name
        self.role = role
        self.api_client = client  # module-level OpenAI client shared by all agents
        self.kaggle_api = KaggleApi()
        self.kaggle_api.authenticate()

    def query_openai(self, prompt, retries=5, initial_delay=2, model="gpt-4"):
        """Send ``prompt`` as a single user message and return the reply text.

        Rate-limit errors are retried with exponential backoff: the wait
        before retry ``k`` (0-based) is ``initial_delay * 2**k`` seconds.

        Args:
            prompt: Text of the user message.
            retries: Total number of attempts; must be at least 1.
            initial_delay: Seconds to wait before the first retry.
            model: Chat model name passed to the API.

        Returns:
            The content of the first choice in the response.

        Raises:
            ValueError: If ``retries`` is smaller than 1.
            RateLimitError: If the final attempt is still rate limited.
        """
        if retries < 1:
            # With no attempts the loop below would never run and the method
            # would silently return None, which callers then embed in prompts.
            raise ValueError("retries must be at least 1")

        for attempt in range(retries):
            try:
                response = self.api_client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}],
                )
            except RateLimitError:
                if attempt == retries - 1:
                    raise
                delay = initial_delay * (2 ** attempt)
                print(f"Rate limit exceeded. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                return response.choices[0].message.content

    def retrieve_kaggle_data(self, dataset, file_name):
        """Download and unzip a Kaggle dataset, then load one CSV from it.

        Args:
            dataset: Kaggle dataset identifier passed to the Kaggle API.
            file_name: Path of the CSV to read after extraction; files are
                extracted into the current working directory.

        Returns:
            A ``pandas.DataFrame`` read from ``file_name``.
        """
        print("\n=== Downloading Dataset from Kaggle ===")
        self.kaggle_api.dataset_download_files(dataset, path=".", unzip=True)
        print(f"Downloaded and unzipped dataset: {dataset}.zip")
        return pd.read_csv(file_name)

# Specialized agents for different research tasks
class DataAnalysisAgent(Agent):
    """Agent that plots, models and statistically tests a temperature series.

    Every method reads the ``'AverageTemperature'`` column of ``df`` and relies
    on ``df`` having a ``DatetimeIndex`` (the code calls ``resample`` and
    ``index.year``).
    """

    @staticmethod
    def _yearly_mean(df):
        """Return the per-year mean of ``AverageTemperature``."""
        return df['AverageTemperature'].resample('Y').mean()

    def plot_time_series(self, df):
        """Plot yearly mean temperature with a 5-year rolling band.

        The shaded band is the 5-year rolling mean +/- 1.96 times the 5-year
        rolling standard deviation.
        """
        df_yearly = self._yearly_mean(df)
        # Compute the rolling statistics once instead of once per band edge.
        rolling = df_yearly.rolling(window=5)
        band_center = rolling.mean()
        band_half_width = 1.96 * rolling.std()

        plt.figure(figsize=(14, 7))
        plt.plot(df_yearly.index, df_yearly, label="Yearly Avg Temp")
        plt.fill_between(df_yearly.index,
                         band_center - band_half_width,
                         band_center + band_half_width,
                         alpha=0.2, label="95% Confidence Interval")
        plt.title("Yearly Global Average Temperature Trend with Confidence Interval")
        plt.xlabel("Year")
        plt.ylabel("Temperature (°C)")
        plt.legend()
        plt.show()

    def spectral_analysis(self, df):
        """Plot the FFT amplitude spectrum of the non-missing readings.

        Only the first half of the spectrum is drawn.
        """
        temperature_data = df['AverageTemperature'].dropna()
        temperature_fft = np.fft.fft(temperature_data)
        freqs = np.fft.fftfreq(len(temperature_data))
        half = len(freqs) // 2

        plt.figure(figsize=(12, 6))
        plt.plot(freqs[:half], np.abs(temperature_fft[:half]))
        plt.title("Spectral Analysis of Temperature Anomalies")
        plt.xlabel("Frequency")
        plt.ylabel("Amplitude")
        plt.show()

    def rolling_statistics(self, df):
        """Plot yearly mean temperature with 5- and 20-year rolling means."""
        df_yearly = self._yearly_mean(df)
        plt.figure(figsize=(14, 7))
        plt.plot(df_yearly.index, df_yearly, label="Yearly Avg Temp")
        plt.plot(df_yearly.index, df_yearly.rolling(window=5).mean(), label="5-Year Rolling Mean", linestyle="--")
        plt.plot(df_yearly.index, df_yearly.rolling(window=20).mean(), label="20-Year Rolling Mean", linestyle="--")
        plt.title("5-Year and 20-Year Rolling Mean of Global Temperature")
        plt.xlabel("Year")
        plt.ylabel("Temperature (°C)")
        plt.legend()
        plt.show()

    def compare_forecasting_models(self, df, steps=20, seasonal_period=12):
        """Fit ARIMA and SARIMA to the yearly means and plot both forecasts.

        Args:
            df: Frame with a ``DatetimeIndex`` and ``'AverageTemperature'``.
            steps: Number of yearly periods to forecast.
            seasonal_period: Season length used in the SARIMA seasonal order.
        """
        df_yearly = self._yearly_mean(df).dropna()

        arima_fit = ARIMA(df_yearly, order=(5, 1, 0)).fit()
        arima_forecast = arima_fit.forecast(steps=steps)

        # NOTE(review): a 12-step season on yearly data looks like a carry-over
        # from monthly modelling; the default is kept for compatibility - confirm.
        sarima_model = SARIMAX(df_yearly, order=(1, 1, 1),
                               seasonal_order=(1, 1, 1, seasonal_period))
        sarima_fit = sarima_model.fit()
        sarima_forecast = sarima_fit.get_forecast(steps=steps).predicted_mean

        # Forecasts describe the periods AFTER the last observation. Starting
        # the axis on the last observed date itself would draw every forecast
        # one year too early, so keep only dates strictly later than it.
        last_observed = df_yearly.index[-1]
        forecast_index = pd.date_range(last_observed, periods=steps + 1, freq='Y')
        forecast_index = forecast_index[forecast_index > last_observed][:steps]

        plt.figure(figsize=(14, 7))
        plt.plot(df_yearly.index, df_yearly, label="Observed")
        plt.plot(forecast_index, np.asarray(arima_forecast), label="ARIMA Forecast", linestyle="--")
        plt.plot(forecast_index, np.asarray(sarima_forecast), label="SARIMA Forecast", linestyle="--")
        plt.title("ARIMA vs SARIMA Forecasting")
        plt.xlabel("Year")
        plt.ylabel("Temperature (°C)")
        plt.legend()
        plt.show()

    def hypothesis_testing(self, df):
        """Run a two-sample t-test on yearly means before 1950 vs. 1950 onward.

        Prints the statistics and a verdict at the 0.05 level.

        Returns:
            Tuple ``(t_stat, p_value)`` from ``scipy.stats.ttest_ind``.
        """
        df_yearly = self._yearly_mean(df).dropna()
        is_pre_1950 = df_yearly.index.year < 1950
        pre_1950 = df_yearly[is_pre_1950]
        post_1950 = df_yearly[~is_pre_1950]

        t_stat, p_value = ttest_ind(pre_1950, post_1950)
        print(f"T-Statistic: {t_stat}, P-Value: {p_value}")
        if p_value < 0.05:
            print("Significant difference between pre-1950 and post-1950 average temperatures.")
        else:
            print("No significant difference detected between pre-1950 and post-1950 temperatures.")
        return t_stat, p_value

    def analyze_and_learn(self, df):
        """Fit a linear trend of temperature against calendar year.

        Rows with a missing temperature are skipped: ``linregress`` has no NaN
        handling, so a single missing reading would turn every reported
        statistic into NaN.

        Returns:
            A multi-line text summary (slope, intercept, R-squared, p-value),
            which is also printed.
        """
        temperatures = df['AverageTemperature'].dropna()
        fit = linregress(temperatures.index.year, temperatures)
        trend, intercept, r_value, p_value = fit.slope, fit.intercept, fit.rvalue, fit.pvalue
        summary = f"Slope (°C/Year): {trend:.3f}\nIntercept: {intercept:.3f}°C\nR-squared: {r_value**2:.3f}\nP-value: {p_value:.3e}"
        print("Trend Analysis:\n", summary)
        return summary

# Remaining agents: literature review, hypothesis generation, experiment design, and documentation
class LiteratureReviewAgent(Agent):
    """Agent that asks the language model for a literature review."""

    def conduct_review(self, topic):
        """Return the model's literature review of ``topic``."""
        review_request = (
            f"Conduct a literature review on {topic}, "
            "summarizing key findings, gaps, and challenges."
        )
        return self.query_openai(review_request)

class HypothesisAgent(Agent):
    """Agent that turns a literature review into candidate hypotheses."""

    def generate_hypotheses(self, literature):
        """Return the model's hypotheses derived from ``literature``."""
        return self.query_openai(
            "Generate hypotheses based on this literature review: " + f"{literature}"
        )

class ExperimentationAgent(Agent):
    """Agent that asks the language model to design an experiment."""

    def design_experiment(self, hypothesis):
        """Return the model's experiment design for ``hypothesis``."""
        design_request = "Design an experiment to test this hypothesis: {}".format(hypothesis)
        return self.query_openai(design_request)

# PredictiveModelAgent is disabled (left here as a stub only): forecasting is
# already handled within DataAnalysisAgent.
# class PredictiveModelAgent(Agent):
#     def forecast_trends(self, df):
#         ...

class DocumentationAgent(Agent):
    """Agent that assembles the outputs of the other agents into one report."""

    def compile_report(self, goal, literature, hypotheses, experiment, analysis_summary):
        """Build, print and return the final research report text.

        Each section is a heading line followed by its content, indented by
        eight spaces and separated from the next section by a blank line.
        """
        pad = " " * 8
        sections = (
            ("Goal", goal),
            ("Literature Review", literature),
            ("Hypotheses", hypotheses),
            ("Experiment Design", experiment),
            ("Data Analysis Summary", analysis_summary),
        )

        # The report opens with a newline, then the banner and a blank line.
        lines = ["", f"{pad}=== Final Research Report ===", ""]
        for heading, content in sections:
            lines.extend([f"{pad}{heading}:", f"{pad}{content}", ""])
        # The text ends with an indented, otherwise empty line rather than a
        # blank one, so swap the trailing separator for the indent.
        lines[-1] = pad

        report = "\n".join(lines)
        print(report)
        return report

# Project Manager to coordinate tasks
class ProjectManagerAgent(Agent):
    """Coordinates the specialised agents through one end-to-end research run."""

    def _retrieve_and_preprocess_data(self, kaggle_dataset, file_name, date_column="dt"):
        """Download the dataset and index it by date for the analysis steps.

        The analysis agent calls ``resample`` and ``index.year`` on the frame,
        both of which need a ``DatetimeIndex``.

        Args:
            kaggle_dataset: Kaggle dataset identifier.
            file_name: CSV file to load from the extracted dataset.
            date_column: Name of the column holding the observation dates.

        Returns:
            The loaded frame, indexed by ``date_column`` and sorted by date.

        Raises:
            ValueError: If ``kaggle_dataset`` or ``file_name`` is missing.
        """
        if not kaggle_dataset or not file_name:
            raise ValueError("kaggle_dataset and file_name are both required to load the data.")
        data = self.retrieve_kaggle_data(kaggle_dataset, file_name)
        # NOTE(review): assumes the CSV stores its dates in a column named
        # "dt" - confirm against the dataset actually used.
        data[date_column] = pd.to_datetime(data[date_column])
        return data.set_index(date_column).sort_index()

    def execute_research_workflow(self, goal, kaggle_dataset=None, file_name=None):
        """Run data retrieval, literature review, hypotheses, experiment design,
        data analysis and documentation in that order.

        Args:
            goal: Research question driving the literature review and report.
            kaggle_dataset: Kaggle dataset identifier to download.
            file_name: CSV file inside the dataset to analyse.

        Returns:
            The final report text produced by the documentation agent.
        """
        # Only agents that exist in this file are created: the former
        # "DataRetrievalAgent" and "PredictiveModelAgent" lookups raised
        # NameError (the latter is commented out and was never used).
        review_agent = LiteratureReviewAgent("Literature Review Agent", "Conducts literature reviews")
        hypothesis_agent = HypothesisAgent("Hypothesis Agent", "Generates hypotheses")
        experiment_agent = ExperimentationAgent("Experiment Agent", "Designs experiments")
        analysis_agent = DataAnalysisAgent("Data Analysis Agent", "Performs data analysis")
        doc_agent = DocumentationAgent("Documentation Agent", "Compiles final report")

        print("\n=== Starting Data Retrieval ===")
        data = self._retrieve_and_preprocess_data(kaggle_dataset, file_name)

        print("\n=== Literature Review ===")
        literature_review = review_agent.conduct_review(goal)

        print("\n=== Hypothesis Generation ===")
        hypotheses = hypothesis_agent.generate_hypotheses(literature_review)

        print("\n=== Experiment Design ===")
        experiment_design = experiment_agent.design_experiment(hypotheses)

        print("\n=== Data Analysis ===")
        analysis_summary = analysis_agent.analyze_and_learn(data)
        analysis_agent.plot_time_series(data)
        analysis_agent.spectral_analysis(data)
        analysis_agent.rolling_statistics(data)
        analysis_agent.compare_forecasting_models(data)
        analysis_agent.hypothesis_testing(data)

        print("\n=== Documentation ===")
        return doc_agent.compile_report(goal, literature_review, hypotheses, experiment_design, analysis_summary)

# Workflow inputs: the research question and the Kaggle dataset to analyse
goal = (
    "Investigate the impact of climate change on global temperature trends "
    "using historical data."
)
kaggle_dataset = "berkeleyearth/climate-change-earth-surface-temperature-data"
file_name = "GlobalLandTemperaturesByCity.csv"

# Build the coordinating agent and run the whole pipeline
manager = ProjectManagerAgent(
    name="Project Manager",
    role="Orchestrates complex research workflows",
)
manager.execute_research_workflow(
    goal=goal,
    kaggle_dataset=kaggle_dataset,
    file_name=file_name,
)


In [None]:
import os

# Credentials to export for this session. Fill in the values you want to set;
# an empty placeholder is skipped so it cannot overwrite a credential that is
# already present in the environment.
# Kaggle values come from an API token (kaggle.json) generated in your account.
# Avoid committing real keys together with the notebook.
_credentials = {
    "OPENAI_API_KEY": "",  # Replace with your actual API key
    "KAGGLE_USERNAME": "",  # Replace with your Kaggle username
    "KAGGLE_KEY": "",  # Replace with your Kaggle API key
}

for _name, _value in _credentials.items():
    if _value:
        os.environ[_name] = _value

# Actually check the result instead of reporting success unconditionally.
_unset = [_name for _name in _credentials if not os.environ.get(_name)]
if _unset:
    print(f"Warning: no value set for: {', '.join(_unset)}")
else:
    print("Environment variables set for OpenAI and Kaggle.")