# Numerical analysis without GridSearch (too much time to execute)

In [86]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import pandas as pd

In [87]:
y_label = 'retweetCount'

results_regression_df = pd.DataFrame(columns=["Filename", "Model", "Predictions", "MSE", "R2"])
regression_models = [SVR(),RandomForestRegressor(),MLPRegressor()]

results_classification_df = pd.DataFrame(columns=["Filename", "Model", "Predictions", "Accuracy", "Precision", "Recall", "F1"])
classification_models = [SVC(),RandomForestClassifier(),MLPClassifier()]

filenames = ['tf_idf.csv','doc2vec.csv','numerical.csv', 'tf_idf_doc2vec.csv', 'combinaison.csv']

In [88]:
from sklearn.preprocessing import StandardScaler

def Standarize(X_train, X_test):
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled


In [89]:
from sklearn.model_selection import train_test_split

def DataSplit(dataframe, y_label):

    X = dataframe.drop(y_label, axis=1)
    y = dataframe[y_label] 

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train, X_test = Standarize(X_train, X_test)

    return X_train, X_test, y_train, y_test

In [90]:
def FitAndPredict_Regression(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return model, predictions, mse, r2

In [91]:
def FitAndPredict_Classification(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted') 
    f1 = f1_score(y_test, predictions, average='weighted')
    return model, predictions, accuracy, precision, recall, f1

In [92]:
for filename in filenames:
    tweets_df = pd.read_csv(filename)
    tweets_df = tweets_df[tweets_df[y_label]>5]
    
    X_train, X_test, y_train, y_test = DataSplit(tweets_df, y_label)

    for model in regression_models:
       trained_model, predictions, mse, r2 = FitAndPredict_Regression(model, X_train, X_test, y_train, y_test)
       
       temp_df = pd.DataFrame({
            "Filename": [filename],
            "Model": [trained_model],
            "Predictions": [predictions],
            "MSE": [mse],
            "R2": [r2]
        })
       
       results_regression_df = pd.concat([results_regression_df, temp_df], ignore_index=True)




In [93]:
results_regression_df['MSE'] = results_regression_df['MSE'].astype(float)

mse_min_index = results_regression_df['MSE'].idxmin()

mse_min_model = results_regression_df.loc[mse_min_index, 'Model']
mse_min_filename = results_regression_df.loc[mse_min_index, 'Filename']
mse_min_mse = results_regression_df.loc[mse_min_index, 'MSE']

if results_regression_df['R2'].dtype != float:
    results_regression_df['R2'] = results_regression_df['R2'].astype(float)

r2_closest_to_1_index = results_regression_df['R2'].idxmax()

r2_min_model = results_regression_df.loc[r2_closest_to_1_index, 'Model']
r2_min_filename = results_regression_df.loc[r2_closest_to_1_index, 'Filename']
r2_min_r2 = results_regression_df.loc[r2_closest_to_1_index, 'R2']

print("MSE minimum:")
print("Model:", mse_min_model)
print("Filename:", mse_min_filename)
print("MSE:", mse_min_mse)

print("\nR2 (closet to 1):")
print("Model:", r2_min_model)
print("Filename:", r2_min_filename)
print("R2:", r2_min_r2)


MSE minimum:
Model: MLPRegressor()
Filename: numerical.csv
MSE: 22184.651482541776

R2 (closet to 1):
Model: MLPRegressor()
Filename: combinaison.csv
R2: 0.757106155561156


In [94]:
results_regression_df

Unnamed: 0,Filename,Model,Predictions,MSE,R2
0,tf_idf.csv,SVR(),"[56.46660065887589, 52.51327880060185, 54.8170...",171204.233158,-0.069549
1,tf_idf.csv,"(DecisionTreeRegressor(max_features='auto', ra...","[268.72, 110.7, 138.03, 270.64, 177.26, 111.8,...",144487.808454,0.097354
2,tf_idf.csv,MLPRegressor(),"[339.32768311295234, 216.94403450033937, 97.78...",147164.052571,0.080635
3,doc2vec.csv,SVR(),"[56.3779342462577, 55.01803463157481, 57.48925...",171527.819268,-0.071571
4,doc2vec.csv,"(DecisionTreeRegressor(max_features='auto', ra...","[136.52, 89.94, 153.5, 290.13, 135.5, 179.69, ...",165921.312926,-0.036546
5,doc2vec.csv,MLPRegressor(),"[124.6817593835479, 82.28729448939416, 120.349...",154789.057969,0.033
6,numerical.csv,SVR(),"[64.71380296839794, 45.64158035733195, 39.7027...",88139.763646,-0.002382
7,numerical.csv,"(DecisionTreeRegressor(max_features='auto', ra...","[84.94, 80.1, 25.57, 35.39, 30.67, 34.48, 39.6...",29386.800379,0.665794
8,numerical.csv,MLPRegressor(),"[61.20848984118332, 94.75964123547945, 51.6630...",22184.651483,0.747702
9,tf_idf_doc2vec.csv,SVR(),"[56.3779342462577, 55.01803463157481, 57.48925...",171527.819268,-0.071571


## BONUS 

In this classification task, the data was divided into 6 classes based on the number of retweets, with each class representing a different range. The classes were sorted in ascending order of retweet counts. The obtained classification score of 0.37 might seem relatively low at first glance. However, considering the context of having 6 classes, a random classifier would achieve an accuracy of approximately 1/6 or around 0.17. Therefore, achieving a score of 0.37 indicates that the classification model performs significantly better than random chance and provides meaningful predictions. It's important to analyze multiple metrics to gain a comprehensive understanding of the model's performance.

In [95]:
def RegressionIntoClassification(dataframe):
    sorted_data = dataframe.sort_values('retweetCount')
    total_tweets = len(sorted_data)
    class_thresholds = [int(total_tweets * i / 6) for i in range(1, 11)]
    dataframe['class'] = ''
    for i in range(len(class_thresholds)):
        if i == 0:
            dataframe.loc[:class_thresholds[i], 'class'] = i+1
        else:
            dataframe.loc[class_thresholds[i-1]+1:class_thresholds[i], 'class'] = i+1
    dataframe.drop('retweetCount', axis=1, inplace=True)
    dataframe['class'] = dataframe['class'].astype(int)
    return dataframe

In [96]:
for filename in filenames:
    tweets_df = pd.read_csv(filename)
    tweets_df = tweets_df[tweets_df[y_label]>5]
    tweets_df = RegressionIntoClassification(tweets_df)
    X_train, X_test, y_train, y_test = DataSplit(tweets_df, 'class')

    for model in classification_models:
        model, predictions, accuracy, precision, recall, f1 = FitAndPredict_Classification(model, X_train, X_test, y_train, y_test)
        
        temp_df = pd.DataFrame({
                "Filename": [filename],
                "Model": [model],
                "Predictions": [predictions],
                "Accuracy": [accuracy],
                "Precision": [precision],
                "Recall" : [recall],
                "F1" : [f1]
            })
        
        results_classification_df = pd.concat([results_classification_df, temp_df], ignore_index=True)



In [97]:
for metric in ['Accuracy', 'Precision', 'Recall', 'F1']:
    if results_classification_df[metric].dtype != float:
        results_classification_df[metric] = results_classification_df[metric].astype(float)

accuracy_max_index = results_classification_df['Accuracy'].idxmax()

accuracy_max_model = results_classification_df.loc[accuracy_max_index, 'Model']
accuracy_max_filename = results_classification_df.loc[accuracy_max_index, 'Filename']
accuracy_max_accuracy = results_classification_df.loc[accuracy_max_index, 'Accuracy']

print("Best Accuracy:")
print("Model:", accuracy_max_model)
print("Filename:", accuracy_max_filename)
print("Accuracy:", accuracy_max_accuracy)

recall_max_index = results_classification_df['Recall'].idxmax()

recall_max_model = results_classification_df.loc[recall_max_index, 'Model']
recall_max_filename = results_classification_df.loc[recall_max_index, 'Filename']
recall_max_recall = results_classification_df.loc[recall_max_index, 'Recall']


Best Accuracy:
Model: RandomForestClassifier()
Filename: numerical.csv
Accuracy: 0.37221269296740994


In [98]:
results_classification_df

Unnamed: 0,Filename,Model,Predictions,Accuracy,Precision,Recall,F1
0,tf_idf.csv,SVC(),"[5, 5, 5, 5, 4, 6, 4, 5, 2, 1, 5, 4, 3, 4, 4, ...",0.318644,0.342614,0.318644,0.323474
1,tf_idf.csv,"(DecisionTreeClassifier(max_features='auto', r...","[4, 5, 6, 1, 4, 6, 5, 2, 2, 1, 5, 4, 3, 5, 1, ...",0.313559,0.322756,0.313559,0.314026
2,tf_idf.csv,MLPClassifier(),"[5, 2, 4, 4, 6, 6, 4, 2, 5, 5, 5, 4, 5, 3, 4, ...",0.294915,0.295762,0.294915,0.29427
3,doc2vec.csv,SVC(),"[4, 5, 5, 4, 6, 3, 5, 5, 2, 4, 5, 6, 6, 5, 4, ...",0.222034,0.256847,0.222034,0.212531
4,doc2vec.csv,"(DecisionTreeClassifier(max_features='auto', r...","[6, 5, 2, 3, 6, 2, 5, 4, 2, 1, 5, 3, 6, 2, 2, ...",0.19322,0.205943,0.19322,0.197612
5,doc2vec.csv,MLPClassifier(),"[6, 2, 4, 3, 6, 3, 4, 6, 2, 3, 1, 6, 6, 5, 4, ...",0.19322,0.195589,0.19322,0.191526
6,numerical.csv,SVC(),"[6, 4, 4, 6, 5, 3, 1, 1, 3, 6, 1, 5, 6, 3, 1, ...",0.178388,0.182952,0.178388,0.174947
7,numerical.csv,"(DecisionTreeClassifier(max_features='auto', r...","[2, 5, 6, 2, 5, 1, 4, 2, 3, 1, 3, 5, 6, 1, 3, ...",0.372213,0.379827,0.372213,0.373684
8,numerical.csv,MLPClassifier(),"[2, 4, 4, 6, 5, 3, 1, 1, 3, 4, 1, 4, 6, 4, 1, ...",0.176672,0.180918,0.176672,0.177647
9,tf_idf_doc2vec.csv,SVC(),"[4, 5, 5, 4, 6, 3, 5, 5, 2, 4, 5, 6, 6, 5, 4, ...",0.222034,0.256847,0.222034,0.212531
