# Numerical analysis without GridSearch (GridSearch takes too long to run)

In [26]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd

In [27]:
# Name of the target column that every model is asked to predict.
y_label = 'retweetCount'

# Empty results table: one row per (dataset file, model) evaluation.
results_df = pd.DataFrame(
    columns=["Filename", "Model", "Predictions", "MSE", "R2"]
)

# Regressors to compare, each instantiated with its default hyperparameters.
models = [
    SVR(),
    RandomForestRegressor(),
    MLPRegressor(),
]

# CSV files to evaluate; each one is read and scored independently.
filenames = [
    'tf_idf.csv',
    'doc2vec.csv',
    'numerical.csv',
    'tf_idf_doc2vec.csv',
    'combinaison.csv',
]

In [28]:
from sklearn.preprocessing import StandardScaler

def Standarize(X_train, X_test):
    """Standardize the train and test features with a single StandardScaler.

    The scaler is fitted on ``X_train`` only and then applied to both sets,
    so no statistics from the test set are used for the scaling.

    Parameters
    ----------
    X_train : features used to fit the scaler.
    X_test : features transformed with the scaler fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)


In [29]:
from sklearn.model_selection import train_test_split

def DataSplit(dataframe, y_label):
    """Split a dataframe into standardized train/test features and targets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding the target column and the feature columns.
    y_label : str
        Name of the target column; every other column is used as a feature.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The split keeps 20% of the
        rows for testing with a fixed ``random_state`` of 42. The two feature
        sets are the outputs of ``Standarize``; the targets are left as is.
    """
    target = dataframe[y_label]                  # dependent variable
    features = dataframe.drop(columns=y_label)   # independent variables

    split = train_test_split(features, target, test_size=0.2, random_state=42)
    features_train, features_test, target_train, target_test = split

    scaled_train, scaled_test = Standarize(features_train, features_test)

    return scaled_train, scaled_test, target_train, target_test

In [30]:
def FitAndPredict(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place and the same object is returned.
    X_train, y_train : training features and targets.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    tuple
        ``(model, predictions, mse, r2)`` where ``predictions`` is the output
        of ``model.predict(X_test)`` and ``mse`` / ``r2`` are computed with
        ``mean_squared_error`` and ``r2_score`` against ``y_test``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return (
        model,
        y_pred,
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )

In [31]:
from sklearn.base import clone

# Only tweets with strictly more than this many retweets are kept.
MIN_RETWEETS = 5

# One record per (dataset file, model) pair. The table is built once after
# the loops instead of being grown with pd.concat on every iteration.
records = []

for filename in filenames:
    tweets_df = pd.read_csv(filename)
    tweets_df = tweets_df[tweets_df[y_label] > MIN_RETWEETS]

    X_train, X_test, y_train, y_test = DataSplit(tweets_df, y_label)

    for model in models:
        # Fit a fresh, unfitted copy of the estimator. Refitting the shared
        # instance from `models` would make every stored "Model" entry for
        # that estimator refer to the same object, fitted on the last file.
        trained_model, predictions, mse, r2 = FitAndPredict(
            clone(model), X_train, X_test, y_train, y_test
        )

        records.append({
            "Filename": filename,
            "Model": trained_model,
            "Predictions": predictions,
            "MSE": mse,
            "R2": r2,
        })

# Rebuilding the table from scratch keeps the cell idempotent: running it
# again does not append duplicate rows to a previous result.
results_df = pd.DataFrame(
    records, columns=["Filename", "Model", "Predictions", "MSE", "R2"]
)




In [32]:
# Make sure both metric columns are numeric before looking for extrema.
results_df['MSE'] = results_df['MSE'].astype(float)
results_df['R2'] = results_df['R2'].astype(float)

# Row with the lowest mean squared error.
mse_min_index = results_df['MSE'].idxmin()

mse_min_model = results_df.loc[mse_min_index, 'Model']
mse_min_filename = results_df.loc[mse_min_index, 'Filename']
mse_min_mse = results_df.loc[mse_min_index, 'MSE']

# Row with the highest R2, i.e. the best score (idxmax, not idxmin).
r2_closest_to_1_index = results_df['R2'].idxmax()

# NOTE: despite the `r2_min_` prefix these hold the values of the row with
# the MAXIMUM R2. The names are kept unchanged in case other cells use them.
r2_min_model = results_df.loc[r2_closest_to_1_index, 'Model']
r2_min_filename = results_df.loc[r2_closest_to_1_index, 'Filename']
r2_min_r2 = results_df.loc[r2_closest_to_1_index, 'R2']

# Display the results.
print("MSE minimum:")
print("Model:", mse_min_model)
print("Filename:", mse_min_filename)
print("MSE:", mse_min_mse)

# The label used to read "R2 minimum" although the maximum is reported.
print("\nR2 maximum:")
print("Model:", r2_min_model)
print("Filename:", r2_min_filename)
print("R2:", r2_min_r2)


MSE minimum:
Model: RandomForestRegressor()
Filename: combinaison.csv
MSE: 135217.4582252542

R2 minimum:
Model: RandomForestRegressor()
Filename: combinaison.csv
R2: 0.15526769736649437


In [33]:
results_df

Unnamed: 0,Filename,Model,Predictions,MSE,R2
0,tf_idf.csv,SVR(),"[56.46660065887589, 52.51327880060185, 54.8170...",171204.233158,-0.069549
1,tf_idf.csv,"(DecisionTreeRegressor(max_features='auto', ra...","[209.73, 96.8, 138.6, 186.91, 144.95, 183.38, ...",145871.187162,0.088712
2,tf_idf.csv,MLPRegressor(),"[276.39427433122626, 202.90593568996118, 91.93...",146769.075479,0.083102
3,doc2vec.csv,SVR(),"[56.3779342462577, 55.01803463157481, 57.48925...",171527.819268,-0.071571
4,doc2vec.csv,"(DecisionTreeRegressor(max_features='auto', ra...","[197.03, 111.71, 216.21, 448.29, 155.5, 202.43...",166540.866176,-0.040416
5,doc2vec.csv,MLPRegressor(),"[115.02282698321753, 87.76261433856317, 122.77...",155196.676468,0.030453
6,numerical.csv,SVR(),"[47.98106411565248, 54.36938773726657, 65.9740...",169848.542557,-0.06108
7,numerical.csv,"(DecisionTreeRegressor(max_features='auto', ra...","[100.21, 41.8, 287.49, 55.09, 110.46, 361.78, ...",135325.355833,0.154594
8,numerical.csv,MLPRegressor(),"[99.7739754299408, 193.85449022723938, 182.834...",154668.580988,0.033752
9,tf_idf_doc2vec.csv,SVR(),"[57.04299456499802, 55.464284370680744, 56.411...",171455.184368,-0.071117
