# Numerical analysis without GridSearch (too much time to execute) but add cross validation

In [15]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.model_selection import cross_val_score

import pandas as pd

In [16]:
y_label = 'retweetCount'

results_regression_df = pd.DataFrame(columns=["Filename", "Model", "MSE", "R2"])
regression_models = [SVR(),RandomForestRegressor(),MLPRegressor()]

results_classification_df = pd.DataFrame(columns=["Filename", "Model", "Accuracy", "Precision", "Recall", "F1"])
classification_models = [SVC(),RandomForestClassifier(),MLPClassifier()]

filenames = ['tf_idf.csv','doc2vec.csv','numerical.csv', 'tf_idf_doc2vec.csv', 'combinaison.csv']

In [17]:
from sklearn.preprocessing import StandardScaler

def Standarize(X_train, X_test):
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled


In [18]:
from sklearn.model_selection import train_test_split

def DataSplit(dataframe, y_label):

    X = dataframe.drop(y_label, axis=1)
    y = dataframe[y_label] 

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train, X_test = Standarize(X_train, X_test)

    return X_train, X_test, y_train, y_test

In [19]:
def FitAndPredict_Regression(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return model, predictions, mse, r2

In [20]:
def FitAndPredict_Classification(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted') 
    f1 = f1_score(y_test, predictions, average='weighted')
    return accuracy, precision, recall, f1

In [21]:
for filename in filenames:
    tweets_df = pd.read_csv(filename)
    tweets_df = tweets_df[tweets_df[y_label]>5]
    
    X_train, X_test, y_train, y_test = DataSplit(tweets_df, y_label)

    for model in regression_models:
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        mse_scores = -scores
        r2_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
        
        temp_df = pd.DataFrame({
            "Filename": [filename] * 5,
            "Model": [model] * 5,
            "MSE": mse_scores,
            "R2": r2_scores
        })
    
        results_regression_df = pd.concat([results_regression_df, temp_df], ignore_index=True)




In [22]:
results_regression_df['MSE'] = results_regression_df['MSE'].astype(float)

mse_min_index = results_regression_df['MSE'].idxmin()

mse_min_model = results_regression_df.loc[mse_min_index, 'Model']
mse_min_filename = results_regression_df.loc[mse_min_index, 'Filename']
mse_min_mse = results_regression_df.loc[mse_min_index, 'MSE']

results_regression_df['R2'] = results_regression_df['R2'].astype(float)

r2_closest_to_1_index = results_regression_df['R2'].idxmax()

r2_min_model = results_regression_df.loc[r2_closest_to_1_index, 'Model']
r2_min_filename = results_regression_df.loc[r2_closest_to_1_index, 'Filename']
r2_min_r2 = results_regression_df.loc[r2_closest_to_1_index, 'R2']

print("MSE minimum:")
print("Model:", mse_min_model)
print("Filename:", mse_min_filename)
print("MSE:", mse_min_mse)

print("\nR2 (closest to 1):")
print("Model:", r2_min_model)
print("Filename:", r2_min_filename)
print("R2:", r2_min_r2)


MSE minimum:
Model: RandomForestRegressor()
Filename: combinaison.csv
MSE: 14100.05966059322

R2 (closest to 1):
Model: RandomForestRegressor()
Filename: numerical.csv
R2: 0.839916969002783


In [23]:
results_regression_df

Unnamed: 0,Filename,Model,MSE,R2
0,tf_idf.csv,SVR(),146927.124595,-0.065934
1,tf_idf.csv,SVR(),211895.897010,-0.103313
2,tf_idf.csv,SVR(),179091.618984,-0.071192
3,tf_idf.csv,SVR(),53554.269963,-0.112458
4,tf_idf.csv,SVR(),96086.286838,-0.076482
...,...,...,...,...
70,combinaison.csv,MLPRegressor(),36369.705477,0.728872
71,combinaison.csv,MLPRegressor(),86015.320412,0.563461
72,combinaison.csv,MLPRegressor(),53981.809431,0.680041
73,combinaison.csv,MLPRegressor(),21474.145270,0.462791


## BONUS 

In this classification task, the data was divided into 6 classes based on the number of retweets, with each class representing a different range. The classes were sorted in ascending order of retweet counts. The obtained classification score of 0.37 might seem relatively low at first glance. However, considering the context of having 6 classes, a random classifier would achieve an accuracy of approximately 1/6 or around 0.17. Therefore, achieving a score of 0.37 indicates that the classification model performs significantly better than random chance and provides meaningful predictions. It's important to analyze multiple metrics to gain a comprehensive understanding of the model's performance.

In [24]:
def RegressionIntoClassification(dataframe):
    sorted_data = dataframe.sort_values('retweetCount')
    total_tweets = len(sorted_data)
    class_thresholds = [int(total_tweets * i / 6) for i in range(1, 11)]
    dataframe['class'] = ''
    for i in range(len(class_thresholds)):
        if i == 0:
            dataframe.loc[:class_thresholds[i], 'class'] = i+1
        else:
            dataframe.loc[class_thresholds[i-1]+1:class_thresholds[i], 'class'] = i+1
    dataframe.drop('retweetCount', axis=1, inplace=True)
    dataframe['class'] = dataframe['class'].astype(int)
    return dataframe

In [26]:
for filename in filenames:
    tweets_df = pd.read_csv(filename)
    tweets_df = tweets_df[tweets_df[y_label]>5]
    tweets_df = RegressionIntoClassification(tweets_df)
    X_train, X_test, y_train, y_test = DataSplit(tweets_df, 'class')

    for model in classification_models:
        accuracy_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        precision_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='precision_weighted')
        recall_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='recall_weighted')
        f1_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')

        temp_df = pd.DataFrame({
            "Filename": [filename] * 5,
            "Model": [model] * 5,
            "Accuracy": accuracy_scores,
            "Precision": precision_scores,
            "Recall": recall_scores,
            "F1": f1_scores
        })

        results_classification_df = pd.concat([results_classification_df, temp_df], ignore_index=True)




In [27]:
for metric in ['Accuracy', 'Precision', 'Recall', 'F1']:
    if results_classification_df[metric].dtype != float:
        results_classification_df[metric] = results_classification_df[metric].astype(float)

accuracy_max_index = results_classification_df['Accuracy'].idxmax()
accuracy_max_model = results_classification_df.loc[accuracy_max_index, 'Model']
accuracy_max_filename = results_classification_df.loc[accuracy_max_index, 'Filename']
accuracy_max_accuracy = results_classification_df.loc[accuracy_max_index, 'Accuracy']

print("Best Accuracy:")
print("Model:", accuracy_max_model)
print("Filename:", accuracy_max_filename)
print("Accuracy:", accuracy_max_accuracy)

recall_max_index = results_classification_df['Recall'].idxmax()
recall_max_model = results_classification_df.loc[recall_max_index, 'Model']
recall_max_filename = results_classification_df.loc[recall_max_index, 'Filename']
recall_max_recall = results_classification_df.loc[recall_max_index, 'Recall']

print("Best Recall:")
print("Model:", recall_max_model)
print("Filename:", recall_max_filename)
print("Recall:", recall_max_recall)


Best Accuracy:
Model: RandomForestClassifier()
Filename: numerical.csv
Accuracy: 0.37339055793991416
Best Recall:
Model: RandomForestClassifier()
Filename: numerical.csv
Recall: 0.3653516295025729


In [28]:
results_classification_df

Unnamed: 0,Filename,Model,Accuracy,Precision,Recall,F1,Predictions
0,tf_idf.csv,SVC(),0.318644,0.342614,0.318644,0.323474,"[5, 5, 5, 5, 4, 6, 4, 5, 2, 1, 5, 4, 3, 4, 4, ..."
1,tf_idf.csv,"(DecisionTreeClassifier(max_features='auto', r...",0.318644,0.331794,0.318644,0.322654,"[5, 1, 5, 3, 4, 4, 4, 2, 5, 1, 4, 4, 4, 4, 4, ..."
2,tf_idf.csv,MLPClassifier(),0.283051,0.285386,0.283051,0.283882,"[6, 1, 6, 3, 3, 6, 4, 2, 2, 3, 6, 4, 3, 1, 4, ..."
3,doc2vec.csv,SVC(),0.222034,0.256847,0.222034,0.212531,"[4, 5, 5, 4, 6, 3, 5, 5, 2, 4, 5, 6, 6, 5, 4, ..."
4,doc2vec.csv,"(DecisionTreeClassifier(max_features='auto', r...",0.208475,0.217570,0.208475,0.210937,"[4, 1, 5, 4, 6, 2, 5, 6, 2, 1, 5, 6, 6, 6, 4, ..."
...,...,...,...,...,...,...,...
85,combinaison.csv,MLPClassifier(),0.180085,0.193226,0.228814,0.209902,
86,combinaison.csv,MLPClassifier(),0.190678,0.252092,0.230932,0.238584,
87,combinaison.csv,MLPClassifier(),0.201271,0.184694,0.190678,0.194850,
88,combinaison.csv,MLPClassifier(),0.209746,0.213476,0.194915,0.209749,
