In [3]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb

In [5]:
data=pd.read_csv('datacleaning.csv')

In [6]:
data['text'].fillna('',inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['text'].fillna('',inplace = True)


In [7]:
data['length'] = data['text_'].apply(lambda x: len(x.split()))

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
mapping = {'OR' : 0, 'CG' : 1}
data['label'] = le.fit_transform(data['label'].map(mapping))

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score,precision_score,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler
from itertools import product
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import AdaBoostClassifier

In [10]:
class Spam:
    def __init__(self, data):
        self.data = data
        self.split, self.X, self.y = self.train_test_split(data=data)

    def train_test_split(self, data):
        kf = KFold(n_splits=5, random_state=42, shuffle=True)
        X = data[['text','length']]
        y = data['label']
        folds = list(kf.split(X))
        return folds,X,y

    def extraction(self, min_df=0.0, ngram=(1,1), X_train=None, X_test=None):
        vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram)
        X_trained = pd.DataFrame(vectorizer.fit_transform(X_train['text']).toarray(), columns = vectorizer.get_feature_names_out(), index=X_train.index)
        X_tested = pd.DataFrame(vectorizer.transform(X_test['text']).toarray(), columns = vectorizer.get_feature_names_out(), index=X_test.index)
        return X_trained, X_tested

    def model_mapping(self, model):
        mapping = {
            'Logistics': LogisticRegression,
            'SVM': SVC,
            'KNN': KNeighborsClassifier,
            'Naive Bayes': GaussianNB,
            'MLP': MLPClassifier,
            'LightGBM' : lgb.LGBMClassifier,
            'AdaBoost' : AdaBoostClassifier,
            'CatBoost' : CatBoostClassifier,
            'XGBoost' : xgb.XGBClassifier
        }
        return model, mapping[model]
    def train_report(self, model, X_train=None, y_train=None, X_test=None, y_test=None,param=None):
        clf = model(**param)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test,y_pred)
        f1 = f1_score(y_test,y_pred)
        precision = precision_score(y_test,y_pred)
        y_pred_roc = clf.predict_proba(X_test)[:,1]
        roc = roc_auc_score(y_test,y_pred_roc)
        return accuracy,recall,f1,precision,roc
    
    def feature_selection(self,X_train = None,X_test = None, method = 'pca'):
        if method == 'pca':
            extract = PCA(n_components = 1000,random_state = 42)
        else:
            extract = SelectKBest(chi2, k=1000)
        X_train = extract.fit_transform(X_train)
        X_test = extract.transform(X_test)
        return X_train,X_test
        
    def model_selection(self, model='Logistics', param =None,min_df=None, ngram=None, method = None):
        accu = []
        reca = []
        f1s = []
        pres = []
        rocau = []
        model_name, model_func = self.model_mapping(model)
        method_used = 'none'
        for train_index, test_index in self.split:
            X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
            y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]

            train_vec, test_vec = self.extraction(min_df=min_df,ngram=ngram,X_train=X_train, X_test=X_test)
            scale = StandardScaler()
            X_train = X_train.copy()
            X_test = X_test.copy()
            train_vec['og_length'] = scale.fit_transform(X_train[['length']])
            test_vec['og_length'] = scale.transform(X_test[['length']])
            if train_vec.shape[1] > 2000:
                if method == 'pca':
                    train_vec, test_vec = self.feature_selection(train_vec, test_vec, method='pca')
                    method_used = 'pca'
                elif method == 'selectkbest':
                    train_vec, test_vec = self.feature_selection(train_vec, test_vec, method='selectkbest')
                    method_used = 'selectkbest'
            else:
                method_used = 'none'
            acc,rec,f1,pre,roc = self.train_report(model = model_func, X_train = train_vec, y_train=y_train, X_test=test_vec, y_test=y_test,param = param)
            accu.append(acc)
            reca.append(rec)
            f1s.append(f1)
            pres.append(pre)
            rocau.append(roc)
        accuracy = np.mean(accu)
        recall = np.mean(reca)
        f1_score = np.mean(f1s)
        precision = np.mean(pres)
        roc_auc = np.mean(rocau)
        return method_used,accuracy,recall,f1_score,precision,roc_auc


def main(model, data, min_df=None, ngram=None, param=None, method = 'pca'):
    results_df = pd.DataFrame(columns=['feature_extraction','feature_selection','model', 'accuracy', 'recall', 'f1_score', 'precision', 'roc_auc'])

    spam_instance = Spam(data=data)

    if model == 'KNN':
        knn_params = {
            'n_neighbors': [1, 3, 5],
            'metric': ['cosine', 'euclidean', 'manhattan']
        }
        for knn_param in product(knn_params['n_neighbors'], knn_params['metric']):
            knn_param_dict = {'n_neighbors': knn_param[0], 'metric': knn_param[1]}
            for min_df_val, ngram_val in product(min_df, ngram):
                select, acc, rec, f1, pre, roc = spam_instance.model_selection(model=model, param=knn_param_dict, min_df=min_df_val, ngram=ngram_val)
                model_str = f"{model}{{{min_df_val},{ngram_val}}}"
                results_df_length = len(results_df)
                results_df.loc[results_df_length] = [select, model_str, acc, rec, f1, pre, roc]
    else:
        for min_df_val, ngram_val in product(min_df, ngram):
            select, acc, rec, f1, pre, roc = spam_instance.model_selection(model=model, param=param, min_df=min_df_val, ngram=ngram_val, method = method)
            extraction_str = f"TF_IDF{{{min_df_val},{ngram_val}}}"
            results_df_length = len(results_df)
            results_df.loc[results_df_length] = [extraction_str,select, model, acc, rec, f1, pre, roc]
            print(results_df.loc[results_df_length])
    return results_df

In [11]:
if __name__ == '__main__':

    data=pd.read_csv('datacleaning.csv')
    data['text'].fillna('',inplace = True)
    data=data.copy()
    data['length'] = data['text_'].apply(lambda x: len(x.split()))
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    mapping = {'OR' : 0, 'CG' : 1}
    data['label'] = le.fit_transform(data['label'].map(mapping))
    # check = Spam(data=data)

    # Define parameter combinations for each model
    params = {
        'CatBoost' : {},
        'XGBoost' : {}
        #'AdaBoost' :{}
        #'LightGBM':{}
        #'Logistics': {'max_iter': 1000, 'class_weight': 'balanced', 'random_state': 42}}
        #'SVM': {'probability': True}}
        #'Naive Bayes':
    }
    min_df_values = [0.01,0.001]
    ngram_values = [(1,1),(1,2),(1, 3),(2, 2),(2, 3),(3, 3)]

    results_df = pd.DataFrame(columns=['feature_extraction','feature_selection','model', 'accuracy', 'recall', 'f1_score', 'precision', 'roc_auc'])

    for model_name, param in params.items():
        results = main(model=model_name, data=data, min_df=min_df_values, ngram=ngram_values, param=param,method = 'pca')
        results_df = pd.concat([results_df, results], ignore_index=True)
    
    print(results_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['text'].fillna('',inplace = True)


Learning rate set to 0.045459
0:	learn: 0.6869899	total: 561ms	remaining: 9m 20s
1:	learn: 0.6812944	total: 795ms	remaining: 6m 36s
2:	learn: 0.6748365	total: 1.07s	remaining: 5m 54s
3:	learn: 0.6690773	total: 1.3s	remaining: 5m 25s
4:	learn: 0.6639143	total: 1.52s	remaining: 5m 2s
5:	learn: 0.6589606	total: 1.66s	remaining: 4m 35s
6:	learn: 0.6547102	total: 1.83s	remaining: 4m 20s
7:	learn: 0.6509010	total: 2.05s	remaining: 4m 14s
8:	learn: 0.6474843	total: 2.24s	remaining: 4m 7s
9:	learn: 0.6432165	total: 2.44s	remaining: 4m 1s
10:	learn: 0.6393365	total: 2.66s	remaining: 3m 59s
11:	learn: 0.6363758	total: 2.9s	remaining: 3m 58s
12:	learn: 0.6327272	total: 3.1s	remaining: 3m 55s
13:	learn: 0.6300318	total: 3.28s	remaining: 3m 50s
14:	learn: 0.6276059	total: 3.49s	remaining: 3m 49s
15:	learn: 0.6247761	total: 3.73s	remaining: 3m 49s
16:	learn: 0.6221701	total: 3.89s	remaining: 3m 45s
17:	learn: 0.6198529	total: 4.09s	remaining: 3m 43s
18:	learn: 0.6172304	total: 4.29s	remaining: 3m 41

  results_df = pd.concat([results_df, results], ignore_index=True)


feature_extraction    TF_IDF{0.01,(1, 1)}
feature_selection                    none
model                             XGBoost
accuracy                         0.839855
recall                           0.837366
f1_score                         0.839445
precision                        0.841547
roc_auc                          0.925107
Name: 0, dtype: object
feature_extraction    TF_IDF{0.01,(1, 2)}
feature_selection                    none
model                             XGBoost
accuracy                         0.844653
recall                           0.835578
f1_score                         0.843217
precision                        0.851008
roc_auc                          0.930152
Name: 1, dtype: object
feature_extraction    TF_IDF{0.01,(1, 3)}
feature_selection                    none
model                             XGBoost
accuracy                         0.844875
recall                           0.835606
f1_score                         0.843399
precision                     

In [12]:
results_df

Unnamed: 0,feature_extraction,feature_selection,model,accuracy,recall,f1_score,precision,roc_auc
0,"TF_IDF{0.01,(1, 1)}",none,CatBoost,0.8495,0.849083,0.849414,0.849765,0.932593
1,"TF_IDF{0.01,(1, 2)}",none,CatBoost,0.85692,0.851101,0.856069,0.861098,0.937517
2,"TF_IDF{0.01,(1, 3)}",none,CatBoost,0.85603,0.85044,0.855192,0.859999,0.937545
3,"TF_IDF{0.01,(2, 2)}",none,CatBoost,0.738722,0.641032,0.710376,0.796716,0.810574
4,"TF_IDF{0.01,(2, 3)}",none,CatBoost,0.738697,0.640849,0.710305,0.796816,0.810762
5,"TF_IDF{0.01,(3, 3)}",none,CatBoost,0.631777,0.391798,0.515394,0.753945,0.679015
6,"TF_IDF{0.001,(1, 1)}",pca,CatBoost,0.857316,0.842099,0.855075,0.868513,0.93463
7,"TF_IDF{0.001,(1, 2)}",pca,CatBoost,0.860086,0.847614,0.858306,0.869303,0.940182
8,"TF_IDF{0.001,(1, 3)}",pca,CatBoost,0.863004,0.850893,0.861303,0.872028,0.940697
9,"TF_IDF{0.001,(2, 2)}",pca,CatBoost,0.823976,0.804264,0.820462,0.837343,0.902855


In [13]:
results_df['length_used'] = 'StandardScaler'

In [14]:
results_df['data'] = 'fake_reviews_dataset'

In [15]:
results_df = results_df[['data','length_used','feature_extraction','feature_selection','model', 'accuracy', 'f1_score','recall', 'precision', 'roc_auc']]

In [16]:
#results_df.to_csv('mindf_0.001_0.01_pca_logistics.csv',index=False)

In [17]:
results_df

Unnamed: 0,data,length_used,feature_extraction,feature_selection,model,accuracy,f1_score,recall,precision,roc_auc
0,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(1, 1)}",none,CatBoost,0.8495,0.849414,0.849083,0.849765,0.932593
1,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(1, 2)}",none,CatBoost,0.85692,0.856069,0.851101,0.861098,0.937517
2,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(1, 3)}",none,CatBoost,0.85603,0.855192,0.85044,0.859999,0.937545
3,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(2, 2)}",none,CatBoost,0.738722,0.710376,0.641032,0.796716,0.810574
4,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(2, 3)}",none,CatBoost,0.738697,0.710305,0.640849,0.796816,0.810762
5,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(3, 3)}",none,CatBoost,0.631777,0.515394,0.391798,0.753945,0.679015
6,fake_reviews_dataset,StandardScaler,"TF_IDF{0.001,(1, 1)}",pca,CatBoost,0.857316,0.855075,0.842099,0.868513,0.93463
7,fake_reviews_dataset,StandardScaler,"TF_IDF{0.001,(1, 2)}",pca,CatBoost,0.860086,0.858306,0.847614,0.869303,0.940182
8,fake_reviews_dataset,StandardScaler,"TF_IDF{0.001,(1, 3)}",pca,CatBoost,0.863004,0.861303,0.850893,0.872028,0.940697
9,fake_reviews_dataset,StandardScaler,"TF_IDF{0.001,(2, 2)}",pca,CatBoost,0.823976,0.820462,0.804264,0.837343,0.902855


In [19]:
results_df.to_csv('mindf_0.001_0.01_pca_boost.csv',index=False)