In [None]:
import pandas as pd
from prophet import Prophet
import numpy as np

# Read the raw observations from the CSV file.
df = pd.read_csv('/content/sample_data.csv')

# Split the rows by the value of the 'Product' column.
grouped = df.groupby('Product')

# Map every 'Product' value to the sub-DataFrame that holds its rows.
city_datasets = dict(iter(grouped))

# Define a function to fill NaN values using rolling mean
def fill_na_with_rolling_mean(series, window=4):
    """Replace missing values in ``series`` with a trailing rolling mean.

    Args:
        series: pandas Series whose NaN entries should be filled.
        window: number of consecutive positions (ending at the current one)
            that the rolling mean spans.

    Returns:
        A new Series in which every NaN is replaced by the rolling mean at
        that position. Because ``min_periods=1`` is used, the mean exists as
        soon as the window holds one non-missing value; a position whose
        whole window is missing stays NaN.
    """
    trailing_mean = series.rolling(window, min_periods=1).mean()
    return series.fillna(trailing_mean)

# Forecast settings.
# TARGET_COLUMN is the series Prophet predicts; REGRESSOR_COLUMNS are passed as
# extra regressors. The target must NOT be registered as a regressor: it is
# renamed to 'y' before fitting, so a regressor of that name would be missing.
FORECAST_HORIZON_DAYS = 7
TARGET_COLUMN = 'SentimentScore'
REGRESSOR_COLUMNS = ['LikeCount', 'CommentCount']
SUBMISSION_COLUMNS = ['submission_ID', 'Product', 'Date', TARGET_COLUMN]

# One small frame per product is collected here and concatenated once at the
# end, instead of growing a DataFrame with pd.concat inside the loop.
submission_frames = []
submission_id = 1

for product_name, product_df in city_datasets.items():
    print(f"Processing Product: {product_name}")

    # Work on a copy so the frames stored in city_datasets are never mutated,
    # and order the rows chronologically before any rolling computation.
    history = product_df.copy()
    history['Date'] = pd.to_datetime(history['Date'])
    history = history.sort_values('Date').reset_index(drop=True)

    # Fill gaps with the trailing rolling mean; anything still missing
    # (e.g. a leading run of NaN) falls back to 0.
    feature_columns = REGRESSOR_COLUMNS + [TARGET_COLUMN]
    for feature_name in feature_columns:
        history[feature_name] = fill_na_with_rolling_mean(history[feature_name])
    history[feature_columns] = history[feature_columns].fillna(0)

    # Prophet requires the time column to be named 'ds' and the target 'y'.
    train_df = history[['Date', TARGET_COLUMN] + REGRESSOR_COLUMNS].rename(
        columns={'Date': 'ds', TARGET_COLUMN: 'y'}
    )

    # Prophet cannot be fitted on fewer than two observations.
    if len(train_df) < 2:
        print(f"Skipping Product {product_name}: not enough rows to fit a model")
        continue

    print(f"Training model for {TARGET_COLUMN} for Product {product_name}")
    m = Prophet()
    for regressor_name in REGRESSOR_COLUMNS:
        m.add_regressor(regressor_name)
    m.fit(train_df)

    # Extend the history by the forecast horizon. Historical rows get their
    # observed regressor values through the merge; the future rows have none,
    # so they are filled with each regressor's historical mean.
    future = m.make_future_dataframe(periods=FORECAST_HORIZON_DAYS)
    future = future.merge(train_df[['ds'] + REGRESSOR_COLUMNS], on='ds', how='left')
    for regressor_name in REGRESSOR_COLUMNS:
        future[regressor_name] = future[regressor_name].fillna(future[regressor_name].mean())

    forecast = m.predict(future)

    # The last FORECAST_HORIZON_DAYS rows are exactly the appended future dates.
    horizon = forecast[['ds', 'yhat']].tail(FORECAST_HORIZON_DAYS).reset_index(drop=True)

    submission_frames.append(pd.DataFrame({
        'submission_ID': range(submission_id, submission_id + len(horizon)),
        'Product': product_name,
        'Date': horizon['ds'],
        TARGET_COLUMN: horizon['yhat'],
    }))
    submission_id += len(horizon)

# NOTE(review): the earlier draft also fitted a second Prophet model that used
# the string column 'Product' as 'y' and discarded its forecast; it was removed
# because it could not run and contributed nothing to the submission.

# Build the submission once; fall back to an empty, correctly-labelled frame
# when no product could be forecast (pd.concat rejects an empty list).
if submission_frames:
    submission_df = pd.concat(submission_frames, ignore_index=True)
else:
    submission_df = pd.DataFrame(columns=SUBMISSION_COLUMNS)

# Save to CSV
submission_df.to_csv('submission2.csv', index=False)

print("Forecasting and submission file generation complete.")


In [None]:
import pandas as pd
from prophet import Prophet
import numpy as np

# Read the raw observations from the CSV file.
df = pd.read_csv('/content/sample_data.csv')

# Split the rows by the value of the 'Product' column.
grouped = df.groupby('Product')

# Map every 'Product' value to the sub-DataFrame that holds its rows.
city_datasets = dict(iter(grouped))

city_datasets

{'ChatGPT':         Date  LikeCount  CommentCount  SentimentScore  Product
 1   8/1/2023         15             9             0.9  ChatGPT
 6   8/2/2023         22            12             0.9  ChatGPT
 11  8/3/2023         15            10             0.9  ChatGPT
 16  8/4/2023         14            12             0.7  ChatGPT
 21  8/5/2023         16             9             0.9  ChatGPT
 26  8/6/2023         17             9             0.6  ChatGPT
 31  8/7/2023         15             8             0.9  ChatGPT
 36  8/8/2023         18             8             0.8  ChatGPT,
 'Gemini':         Date  LikeCount  CommentCount  SentimentScore Product
 2   8/1/2023         12             8             0.7  Gemini
 7   8/2/2023         17             7             0.7  Gemini
 12  8/3/2023         16             7             0.7  Gemini
 17  8/4/2023         15             8             0.7  Gemini
 22  8/5/2023         18             4             0.5  Gemini
 27  8/6/2023         16

In [None]:
# Define a function to fill NaN values using rolling mean
def fill_na_with_rolling_mean(series, window=4):
    """Replace missing values in ``series`` with a trailing rolling mean.

    Args:
        series: pandas Series whose NaN entries should be filled.
        window: number of consecutive positions (ending at the current one)
            that the rolling mean spans.

    Returns:
        A new Series in which every NaN is replaced by the rolling mean at
        that position. Because ``min_periods=1`` is used, the mean exists as
        soon as the window holds one non-missing value; a position whose
        whole window is missing stays NaN.
    """
    trailing_mean = series.rolling(window, min_periods=1).mean()
    return series.fillna(trailing_mean)

# Prepare the submission DataFrame
# submission_df = pd.DataFrame(columns=['Date', 'LikeCount', 'CommentCount', 'SentimentScore'])
# submission_id = 1

In [None]:
# Pick the product to work on in the following cells.
city_id = "ChatGPT"
# Take an independent copy: later cells modify `df` in place, and without the
# copy those changes would also alter the frame stored in city_datasets, so
# re-running this cell would no longer return the original rows.
df = city_datasets[city_id].copy()
df

Unnamed: 0,Date,LikeCount,CommentCount,SentimentScore,Product
1,8/1/2023,15,9,0.9,ChatGPT
6,8/2/2023,22,12,0.9,ChatGPT
11,8/3/2023,15,10,0.9,ChatGPT
16,8/4/2023,14,12,0.7,ChatGPT
21,8/5/2023,16,9,0.9,ChatGPT
26,8/6/2023,17,9,0.6,ChatGPT
31,8/7/2023,15,8,0.9,ChatGPT
36,8/8/2023,18,8,0.8,ChatGPT


In [None]:
# Make the parsed 'Date' values the index of df. When df has no 'Date' column
# (the code takes that to mean it is already the index) only a message is shown.
if 'Date' not in df.columns:
    print("Date is already set as index")
else:
    # Parse the date strings, then move the column into the index.
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
df

Unnamed: 0_level_0,LikeCount,CommentCount,SentimentScore,Product
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-08-01,15,9,0.9,ChatGPT
2023-08-02,22,12,0.9,ChatGPT
2023-08-03,15,10,0.9,ChatGPT
2023-08-04,14,12,0.7,ChatGPT
2023-08-05,16,9,0.9,ChatGPT
2023-08-06,17,9,0.6,ChatGPT
2023-08-07,15,8,0.9,ChatGPT
2023-08-08,18,8,0.8,ChatGPT


In [None]:
# Fill gaps in each numeric column with its trailing rolling mean.
for feature_name in ('LikeCount', 'CommentCount', 'SentimentScore'):
    df[feature_name] = fill_na_with_rolling_mean(df[feature_name])

# Whatever is still missing afterwards becomes 0.
df.fillna(0, inplace=True)

# Turn the index back into an ordinary column.
df.reset_index(inplace=True)
df

Unnamed: 0,Date,LikeCount,CommentCount,SentimentScore,Product
0,2023-08-01,15,9,0.9,ChatGPT
1,2023-08-02,22,12,0.9,ChatGPT
2,2023-08-03,15,10,0.9,ChatGPT
3,2023-08-04,14,12,0.7,ChatGPT
4,2023-08-05,16,9,0.9,ChatGPT
5,2023-08-06,17,9,0.6,ChatGPT
6,2023-08-07,15,8,0.9,ChatGPT
7,2023-08-08,18,8,0.8,ChatGPT


In [None]:
# Two consecutive daily dates starting on 2023-08-09, each tagged with the
# currently selected product.
future_dates = pd.date_range('2023-08-09', periods=2, freq='D')
new_df = pd.DataFrame({'Date': future_dates}).assign(Product=city_id)
new_df

Unnamed: 0,Date,Product
0,2023-08-09,ChatGPT
1,2023-08-10,ChatGPT


In [None]:
# Name of the column used as the forecasting target.
column = 'SentimentScore'

print(f"Training model for {column} for Product {city_id}")

# Select the date, the target and the two regressor columns, then rename the
# date to 'ds' and the target to 'y' (the column names passed to Prophet below).
temp_df = (
    df.loc[:, ['Date', column, 'LikeCount', 'CommentCount']]
    .rename(columns={'Date': 'ds', column: 'y'})
)
temp_df

Training model for SentimentScore for Product ChatGPT


Unnamed: 0,ds,y,LikeCount,CommentCount
0,2023-08-01,0.9,15,9
1,2023-08-02,0.9,22,12
2,2023-08-03,0.9,15,10
3,2023-08-04,0.7,14,12
4,2023-08-05,0.9,16,9
5,2023-08-06,0.6,17,9
6,2023-08-07,0.9,15,8
7,2023-08-08,0.8,18,8


In [None]:
# Build a Prophet model with the two count columns registered as extra
# regressors, then fit it on temp_df.
m = Prophet()
for regressor_name in ('LikeCount', 'CommentCount'):
    m.add_regressor(regressor_name)

m.fit(temp_df)

INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:prophet:n_changepoints greater than number of observations. Using 5.
DEBUG:cmdstanpy:input tempfile: /tmp/tmp1sz51h_y/m5ak6fcj.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp1sz51h_y/suawcli6.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=16836', 'data', 'file=/tmp/tmp1sz51h_y/m5ak6fcj.json', 'init=/tmp/tmp1sz51h_y/suawcli6.json', 'output', 'file=/tmp/tmp1sz51h_y/prophet_modelycmfhtf9/prophet_model-20240711113225.csv', 'method=optimize', 'algorithm=newton', 'iter=10000']
11:32:25 - cmdstanpy - INFO - Chain [1] start processing
IN

AttributeError: 'Prophet' object has no attribute 'score'

In [None]:
# Extend the fitted model's dates by 3 periods, attach the observed regressor
# values by date (rows with no match in df get NaN), and drop the duplicate
# 'Date' key column that the merge brings along.
future = (
    m.make_future_dataframe(periods=3)
    .merge(df[['Date', 'LikeCount', 'CommentCount']], left_on='ds', right_on='Date', how='left')
    .drop(columns=['Date'])
)
future

Unnamed: 0,ds,LikeCount,CommentCount
0,2023-08-01,15.0,9.0
1,2023-08-02,22.0,12.0
2,2023-08-03,15.0,10.0
3,2023-08-04,14.0,12.0
4,2023-08-05,16.0,9.0
5,2023-08-06,17.0,9.0
6,2023-08-07,15.0,8.0
7,2023-08-08,18.0,8.0
8,2023-08-09,,
9,2023-08-10,,


In [None]:
# Fill the missing regressor values with the mean of each column.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# is deprecated in pandas and leaves `future` unchanged under Copy-on-Write,
# which would pass NaN regressors on to the prediction step.
for regressor_name in ('LikeCount', 'CommentCount'):
    future[regressor_name] = future[regressor_name].fillna(future[regressor_name].mean())
future

Unnamed: 0,ds,LikeCount,CommentCount
0,2023-08-01,15.0,9.0
1,2023-08-02,22.0,12.0
2,2023-08-03,15.0,10.0
3,2023-08-04,14.0,12.0
4,2023-08-05,16.0,9.0
5,2023-08-06,17.0,9.0
6,2023-08-07,15.0,8.0
7,2023-08-08,18.0,8.0
8,2023-08-09,16.5,9.625
9,2023-08-10,16.5,9.625


In [None]:
# Run the fitted model over every row of `future` and display the complete
# prediction frame returned by Prophet.
forecast = m.predict(future)
forecast

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,CommentCount,CommentCount_lower,CommentCount_upper,LikeCount,...,additive_terms,additive_terms_lower,additive_terms_upper,extra_regressors_additive,extra_regressors_additive_lower,extra_regressors_additive_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2023-08-01,0.935919,0.838346,1.067067,0.935919,0.935919,0.024979,0.024979,0.024979,-0.013199,...,0.01178,0.01178,0.01178,0.01178,0.01178,0.01178,0.0,0.0,0.0,0.947699
1,2023-08-02,0.904104,0.740918,0.973816,0.904104,0.904104,-0.094921,-0.094921,-0.094921,0.048396,...,-0.046525,-0.046525,-0.046525,-0.046525,-0.046525,-0.046525,0.0,0.0,0.0,0.857579
2,2023-08-03,0.872288,0.724535,0.956861,0.872288,0.872288,-0.014988,-0.014988,-0.014988,-0.013199,...,-0.028187,-0.028187,-0.028187,-0.028187,-0.028187,-0.028187,0.0,0.0,0.0,0.844102
3,2023-08-04,0.840473,0.608582,0.842292,0.840473,0.840473,-0.094921,-0.094921,-0.094921,-0.021998,...,-0.11692,-0.11692,-0.11692,-0.11692,-0.11692,-0.11692,0.0,0.0,0.0,0.723553
4,2023-08-05,0.808657,0.707492,0.943082,0.808657,0.808657,0.024979,0.024979,0.024979,-0.0044,...,0.02058,0.02058,0.02058,0.02058,0.02058,0.02058,0.0,0.0,0.0,0.829237
5,2023-08-06,0.776842,0.690909,0.921329,0.776842,0.776842,0.024979,0.024979,0.024979,0.0044,...,0.029379,0.029379,0.029379,0.029379,0.029379,0.029379,0.0,0.0,0.0,0.806221
6,2023-08-07,0.745027,0.680484,0.914581,0.745027,0.745027,0.064946,0.064946,0.064946,-0.013199,...,0.051747,0.051747,0.051747,0.051747,0.051747,0.051747,0.0,0.0,0.0,0.796774
7,2023-08-08,0.713211,0.674937,0.900495,0.713211,0.713211,0.064946,0.064946,0.064946,0.013199,...,0.078145,0.078145,0.078145,0.078145,0.078145,0.078145,0.0,0.0,0.0,0.791356
8,2023-08-09,0.681396,0.574191,0.798924,0.681396,0.681396,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.681396
9,2023-08-10,0.649581,0.531195,0.772568,0.649581,0.649581,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.649581


In [None]:
# Keep only the predictions for dates after the last observed date.
# Selecting by date instead of a fixed tail(n) keeps the slice correct for any
# number of future periods and never lets a historical row slip in.
last_observed = df['Date'].max()
forecast = (
    forecast.loc[forecast['ds'] > last_observed, ['ds', 'yhat']]
    .reset_index(drop=True)
)
# new_df[column] = forecast['yhat']
# new_df
forecast

Unnamed: 0,ds,yhat
0,2023-08-09,0.681396
1,2023-08-10,0.649581
2,2023-08-11,0.617765
