INTERMEDIATE QUESTIONS :
Q-1. Imagine you have a dataset where you have different Instagram features
like u sername , Caption , Hashtag , Followers , Time_Since_posted , and likes , now your task is
to predict the number of likes and Time Since posted and the rest of the features are
your input features. Now you have to build a model which can predict the
number of likes and Time Since posted.


In [162]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

df = pd.read_csv('dataset/instagram_reach.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,S.No,USERNAME,Caption,Followers,Hashtags,Time since posted,Likes
0,0,1,mikequindazzi,Who are #DataScientist and what do they do? >>...,1600,#MachineLearning #AI #DataAnalytics #DataScien...,11 hours,139
1,1,2,drgorillapaints,We all know where it’s going. We just have to ...,880,#deck .#mac #macintosh#sayhello #apple #steve...,2 hours,23
2,2,3,aitrading_official,Alexander Barinov: 4 years as CFO in multinati...,255,#whoiswho #aitrading #ai #aitradingteam#instat...,2 hours,25
3,3,4,opensourcedworkplace,sfad,340,#iot #cre#workplace #CDO #bigdata #technology#...,3 hours,49
4,4,5,crea.vision,Ever missed a call while your phone was chargi...,304,#instamachinelearning #instabigdata#instamarke...,3 hours,30


In [163]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   S.No               100 non-null    int64 
 2   USERNAME           100 non-null    object
 3   Caption            94 non-null     object
 4   Followers          100 non-null    int64 
 5   Hashtags           100 non-null    object
 6   Time since posted  100 non-null    object
 7   Likes              100 non-null    int64 
dtypes: int64(4), object(4)
memory usage: 6.4+ KB


In [164]:
df.columns

Index(['Unnamed: 0', 'S.No', 'USERNAME', 'Caption', 'Followers', 'Hashtags',
       'Time since posted', 'Likes'],
      dtype='object')

In [165]:
# drop non useful columns from dataset 
if ('Unnamed: 0' or 'S.No') in df.columns:
    df.drop(['Unnamed: 0', 'S.No'], inplace= True , axis = 1)
# fill missing value with mode 
df['Caption'].fillna(df['Caption'].mode(), inplace= True)

In [166]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   USERNAME           100 non-null    object
 1   Caption            100 non-null    object
 2   Followers          100 non-null    int64 
 3   Hashtags           100 non-null    object
 4   Time since posted  100 non-null    object
 5   Likes              100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [167]:
# we will split caption words and hashtags , as algorithms only understand numerical values
df['Caption_Words_No'] = [len(i) for i in df['Caption'].str.split(' ')]
df['Hashtags_words_No'] = [len(i) for i in df['Hashtags'].str.split(' ')]

In [168]:
df['Time since posted'].unique()
# we will remove hourse from string and covert it to  integer
df['time_since_posted_in_numeric'] =  df['Time since posted'].str.replace(' hours', '').astype('float')
df.dtypes

USERNAME                         object
Caption                          object
Followers                         int64
Hashtags                         object
Time since posted                object
Likes                             int64
Caption_Words_No                  int64
Hashtags_words_No                 int64
time_since_posted_in_numeric    float64
dtype: object

In [169]:
# we will scale data to same scale using standard scaler
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# df = sc.fit_transform(df.select_dtypes(include = 'number'))
# df = pd.DataFrame(df , columns= sc.get_feature_names_out())
# df

### tried scandard scaling for data , but error is increasing , hence skipped the test

In [170]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Select the input features and target variables
input_features = ['Followers', 'Caption_Words_No', 'Hashtags_words_No']
target_variables = ['time_since_posted_in_numeric', 'Likes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[input_features], df[target_variables], test_size=0.2, random_state=42)

# Define a dictionary of models and their hyperparameters for GridSearchCV
models = {
    'Random Forest': {
        'estimator': RandomForestRegressor(),
        'param_grid': {'n_estimators': [100, 50, 300 ,10 , 500 , 5], 'max_depth': [None, 5, 10 , 2 , 1, 4]},
        'scoring': 'neg_mean_squared_error'
    },
    'Gradient Boosting': {
        'estimator': GradientBoostingRegressor(),
        'param_grid': {'n_estimators': [100, 50, 300], 'max_depth': [3, 5, 10 , 2]},
        'scoring': 'neg_mean_squared_error'
    },
    'Support Vector Regression': {
        'estimator': SVR(),
        'param_grid': {'C': [0.1, 1, 10], 'epsilon': [0.1, 0.01, 0.001]},
        'scoring': 'neg_mean_squared_error'
    }
}

# Perform grid search and evaluate multiple models using separate GridSearchCV objects
for model_name, model_params in models.items():
    grid_search = GridSearchCV(model_params['estimator'], param_grid=model_params['param_grid'],scoring = 'r2', cv=10)
    model = MultiOutputRegressor(grid_search)
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    best_params = [model.estimators_[i].best_params_ for i in range(1, len(model.estimators_))]
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print the evaluation metrics
    print("Best Parameters:", best_params , model_name)

    print("Root Mean Squared Error (RMSE):", rmse)
    print("Mean Absolute Error (MAE):", mae)
    print("R-squared (R2):", r2)
    print("\n")


Best Parameters: [{'max_depth': 1, 'n_estimators': 50}] Random Forest
Root Mean Squared Error (RMSE): 26.30322839586846
Mean Absolute Error (MAE): 14.834145400202196
R-squared (R2): 0.337317118064878


Best Parameters: [{'max_depth': 2, 'n_estimators': 50}] Gradient Boosting
Root Mean Squared Error (RMSE): 28.184914438278255
Mean Absolute Error (MAE): 14.05689745441751
R-squared (R2): 0.07021992632640345


Best Parameters: [{'C': 10, 'epsilon': 0.1}] Support Vector Regression
Root Mean Squared Error (RMSE): 26.277444495871908
Mean Absolute Error (MAE): 12.28958437452587
R-squared (R2): -0.09784869028629983




# From above training we wil conclude that random forest with above parameters is best fit model

In [171]:
model = MultiOutputRegressor(RandomForestRegressor(n_estimators= 10, max_depth= 2 , random_state= 45))
model.fit(X_train , y_train)
y_hat = randomforest.predict(X_test)
rmse = mean_squared_error(y_test, y_hat) ** 0.5
mae = mean_absolute_error(y_test, y_hat)
r2 = r2_score(y_test, y_hat)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)
print("\n")


Root Mean Squared Error (RMSE): 27.321273351012156
Mean Absolute Error (MAE): 13.85113860028683
R-squared (R2): 0.30391749728158723


