# Numerical analysis with GridSearch and CrossValidation

In [1]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import cross_val_score, GridSearchCV

import pandas as pd

In [2]:
y_label = 'retweetCount'

results_regression_df = pd.DataFrame(columns=["Filename", "Model", "MSE", "R2"])
regression_models = [SVR(),RandomForestRegressor(),MLPRegressor()]

results_classification_df = pd.DataFrame(columns=["Filename", "Model", "Accuracy", "Precision", "Recall", "F1"])
classification_models = [SVC(),RandomForestClassifier(),MLPClassifier()]

filenames = ['tf_idf.csv','doc2vec.csv','numerical.csv', 'tf_idf_doc2vec.csv', 'combinaison.csv']

In [3]:
from sklearn.preprocessing import StandardScaler

def Standarize(X_train, X_test):
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled


In [4]:
from sklearn.model_selection import train_test_split

def DataSplit(dataframe, y_label):
    X = dataframe.drop(y_label, axis=1)
    y = dataframe[y_label] 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test = Standarize(X_train, X_test)
    return X_train, X_test, y_train, y_test

In [5]:
def FitAndPredict_Regression(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return model, predictions, mse, r2

In [6]:
def FitAndPredict_Classification(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted') 
    f1 = f1_score(y_test, predictions, average='weighted')
    return accuracy, precision, recall, f1

In [7]:
#GridSearchCV paremeters
# Define the parameter grids for GridSearchCV
parameters = [
    {
        'kernel': ['linear', 'poly', 'rbf'],
        'C': [0.1, 1, 10]
    },
    {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10]
    },
    {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01]
    }
]


In [11]:
for filename in filenames:
    tweets_df = pd.read_csv(filename)
    tweets_df = tweets_df[tweets_df[y_label] > 5]
    
    X_train, X_test, y_train, y_test = DataSplit(tweets_df, y_label)

    for i, model in enumerate(regression_models):
        grid_search = GridSearchCV(estimator=model, param_grid=parameters[i], scoring='neg_mean_squared_error', cv=5)
        grid_search.fit(X_train, y_train)
        
        best_params = grid_search.best_params_
        #best_score = grid_search.best_score_
        #mts_scores = -grid_search.cv_results_['mean_test_score']
        
        model.set_params(**best_params) 
        
        
        r2_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        mse_scores = -scores
        
        temp_df = pd.DataFrame({
            "Filename": [filename] * 5,
            "Model": [model] * 5,
            "MSE": mse_scores,
            "R2": r2_scores
        })
    
        results_regression_df = pd.concat([results_regression_df, temp_df], ignore_index=True)




In [12]:
results_regression_df['MSE'] = results_regression_df['MSE'].astype(float)

mse_min_index = results_regression_df['MSE'].idxmin()

mse_min_model = results_regression_df.loc[mse_min_index, 'Model']
mse_min_filename = results_regression_df.loc[mse_min_index, 'Filename']
mse_min_mse = results_regression_df.loc[mse_min_index, 'MSE']

results_regression_df['R2'] = results_regression_df['R2'].astype(float)

r2_closest_to_1_index = results_regression_df['R2'].idxmax()

r2_min_model = results_regression_df.loc[r2_closest_to_1_index, 'Model']
r2_min_filename = results_regression_df.loc[r2_closest_to_1_index, 'Filename']
r2_min_r2 = results_regression_df.loc[r2_closest_to_1_index, 'R2']

print("MSE minimum:")
print("Model:", mse_min_model)
print("Filename:", mse_min_filename)
print("MSE:", mse_min_mse)

print("\nR2 (closest to 1):")
print("Model:", r2_min_model)
print("Filename:", r2_min_filename)
print("R2:", r2_min_r2)


MSE minimum:
Model: RandomForestRegressor(n_estimators=200)
Filename: combinaison.csv
MSE: 13159.479578919492

R2 (closest to 1):
Model: MLPRegressor(hidden_layer_sizes=(100, 50))
Filename: combinaison.csv
R2: 0.8588610817494147


In [13]:
results_regression_df

Unnamed: 0,Filename,Model,MSE,R2
0,tf_idf.csv,"SVR(C=10, kernel='linear')",145931.095362,-0.058708
1,tf_idf.csv,"SVR(C=10, kernel='linear')",209702.030729,-0.091890
2,tf_idf.csv,"SVR(C=10, kernel='linear')",177660.077198,-0.062630
3,tf_idf.csv,"SVR(C=10, kernel='linear')",53152.167006,-0.104105
4,tf_idf.csv,"SVR(C=10, kernel='linear')",96112.698115,-0.076777
...,...,...,...,...
70,combinaison.csv,"MLPRegressor(hidden_layer_sizes=(100, 50))",19697.722977,0.858861
71,combinaison.csv,"MLPRegressor(hidden_layer_sizes=(100, 50))",65423.553606,0.651214
72,combinaison.csv,"MLPRegressor(hidden_layer_sizes=(100, 50))",38623.985898,0.784990
73,combinaison.csv,"MLPRegressor(hidden_layer_sizes=(100, 50))",18841.833840,0.616541


## BONUS 

In this classification task, the data was divided into 6 classes based on the number of retweets, with each class representing a different range. The classes were sorted in ascending order of retweet counts. The obtained classification score of 0.37 might seem relatively low at first glance. However, considering the context of having 6 classes, a random classifier would achieve an accuracy of approximately 1/6 or around 0.17. Therefore, achieving a score of 0.37 indicates that the classification model performs significantly better than random chance and provides meaningful predictions. It's important to analyze multiple metrics to gain a comprehensive understanding of the model's performance.

In [14]:
def RegressionIntoClassification(dataframe):
    sorted_data = dataframe.sort_values('retweetCount')
    total_tweets = len(sorted_data)
    class_thresholds = [int(total_tweets * i / 6) for i in range(1, 11)]
    dataframe['class'] = ''
    for i in range(len(class_thresholds)):
        if i == 0:
            dataframe.loc[:class_thresholds[i], 'class'] = i+1
        else:
            dataframe.loc[class_thresholds[i-1]+1:class_thresholds[i], 'class'] = i+1
    dataframe.drop('retweetCount', axis=1, inplace=True)
    dataframe['class'] = dataframe['class'].astype(int)
    return dataframe

In [16]:
for filename in filenames:
    tweets_df = pd.read_csv(filename)
    tweets_df = tweets_df[tweets_df[y_label] > 5]
    
    X_train, X_test, y_train, y_test = DataSplit(tweets_df, y_label)

    for i, model in enumerate(classification_models):
        grid_search = GridSearchCV(estimator=model, param_grid=parameters[i], scoring='accuracy', cv=5)
        grid_search.fit(X_train, y_train)
        
        best_params = grid_search.best_params_
        #best_score = grid_search.best_score_
        #accuracy_scores = grid_search.cv_results_['mean_test_score']
        
        model.set_params(**best_params)  # Utilisation des meilleurs paramètres pour le modèle
        
        precision_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='precision_weighted')
        recall_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='recall_weighted')
        f1_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')
        accuracy_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        
        temp_df = pd.DataFrame({
            "Filename": [filename] * 5,
            "Model": [model] * 5,
            "Accuracy": accuracy_scores,
            "Precision": precision_scores,
            "Recall": recall_scores,
            "F1": f1_scores
        })
    
        results_classification_df = pd.concat([results_classification_df, temp_df], ignore_index=True)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [None]:
for metric in ['Accuracy', 'Precision', 'Recall', 'F1']:
    if results_classification_df[metric].dtype != float:
        results_classification_df[metric] = results_classification_df[metric].astype(float)

accuracy_max_index = results_classification_df['Accuracy'].idxmax()
accuracy_max_model = results_classification_df.loc[accuracy_max_index, 'Model']
accuracy_max_filename = results_classification_df.loc[accuracy_max_index, 'Filename']
accuracy_max_accuracy = results_classification_df.loc[accuracy_max_index, 'Accuracy']

print("Best Accuracy:")
print("Model:", accuracy_max_model)
print("Filename:", accuracy_max_filename)
print("Accuracy:", accuracy_max_accuracy)

recall_max_index = results_classification_df['Recall'].idxmax()
recall_max_model = results_classification_df.loc[recall_max_index, 'Model']
recall_max_filename = results_classification_df.loc[recall_max_index, 'Filename']
recall_max_recall = results_classification_df.loc[recall_max_index, 'Recall']

print("Best Recall:")
print("Model:", recall_max_model)
print("Filename:", recall_max_filename)
print("Recall:", recall_max_recall)


Best Accuracy:
Model: RandomForestClassifier()
Filename: numerical.csv
Accuracy: 0.36363636363636365


In [None]:
results_classification_df

Unnamed: 0,Filename,Model,Predictions,Accuracy,Precision,Recall,F1
0,tf_idf.csv,SVC(),"[5, 5, 5, 5, 4, 6, 4, 5, 2, 1, 5, 4, 3, 4, 4, ...",0.318644,0.342614,0.318644,0.323474
1,tf_idf.csv,"(DecisionTreeClassifier(max_features='auto', r...","[6, 5, 5, 5, 4, 2, 2, 2, 6, 1, 4, 6, 2, 4, 4, ...",0.291525,0.30153,0.291525,0.293888
2,tf_idf.csv,MLPClassifier(),"[3, 1, 5, 1, 2, 6, 4, 2, 2, 5, 5, 4, 3, 2, 5, ...",0.284746,0.288662,0.284746,0.28558
3,doc2vec.csv,SVC(),"[4, 5, 5, 4, 6, 3, 5, 5, 2, 4, 5, 6, 6, 5, 4, ...",0.222034,0.256847,0.222034,0.212531
4,doc2vec.csv,"(DecisionTreeClassifier(max_features='auto', r...","[4, 5, 3, 3, 6, 2, 5, 5, 2, 1, 1, 6, 6, 5, 4, ...",0.186441,0.195404,0.186441,0.188414
5,doc2vec.csv,MLPClassifier(),"[2, 2, 5, 3, 6, 4, 5, 3, 1, 3, 5, 6, 6, 5, 4, ...",0.20678,0.205621,0.20678,0.205521
6,numerical.csv,SVC(),"[6, 4, 4, 6, 5, 3, 1, 1, 3, 6, 1, 5, 6, 3, 1, ...",0.178388,0.182952,0.178388,0.174947
7,numerical.csv,"(DecisionTreeClassifier(max_features='auto', r...","[2, 5, 4, 2, 6, 1, 4, 2, 3, 2, 3, 5, 6, 1, 3, ...",0.363636,0.364234,0.363636,0.362818
8,numerical.csv,MLPClassifier(),"[6, 1, 4, 2, 5, 3, 1, 2, 3, 6, 1, 4, 1, 4, 1, ...",0.19211,0.192782,0.19211,0.191326
9,tf_idf_doc2vec.csv,SVC(),"[4, 5, 5, 4, 6, 3, 5, 5, 2, 4, 5, 6, 6, 5, 4, ...",0.222034,0.256847,0.222034,0.212531
