In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data=pd.read_csv('datacleaning.csv')

In [3]:
data.head()

Unnamed: 0,category,rating,label,text_,text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",love well made sturdi comfort love pretti
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",love great upgrad origin mine coupl year
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,pillow save back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",miss inform use great product price
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,nice set good qualiti set two month


In [4]:
data['text'].fillna('',inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['text'].fillna('',inplace = True)


In [5]:
data['length'] = data['text_'].apply(lambda x: len(x.split()))

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
mapping = {'OR' : 0, 'CG' : 1}
data['label'] = le.fit_transform(data['label'].map(mapping))

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, f1_score,precision_score,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler
from itertools import product
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2

In [8]:
class Spam:
    def __init__(self, data):
        self.data = data
        self.split, self.X, self.y = self.train_test_split(data=data)

    def train_test_split(self, data):
        kf = KFold(n_splits=5, random_state=42, shuffle=True)
        X = data[['text','length']]
        y = data['label']
        folds = list(kf.split(X))
        return folds,X,y

    def extraction(self, min_df=0.0, ngram=(1,1), X_train=None, X_test=None):
        vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=ngram)
        X_trained = pd.DataFrame(vectorizer.fit_transform(X_train['text']).toarray(), columns = vectorizer.get_feature_names_out(), index=X_train.index)
        X_tested = pd.DataFrame(vectorizer.transform(X_test['text']).toarray(), columns = vectorizer.get_feature_names_out(), index=X_test.index)
        return X_trained, X_tested

    def model_mapping(self, model):
        mapping = {
            'Logistics': LogisticRegression,
            'SVM': SVC,
            'KNN': KNeighborsClassifier,
            'Naive Bayes': GaussianNB,
            'MLP': MLPClassifier
        }
        return model, mapping[model]
    def train_report(self, model, X_train=None, y_train=None, X_test=None, y_test=None,param=None):
        clf = model(**param)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test,y_pred)
        f1 = f1_score(y_test,y_pred)
        precision = precision_score(y_test,y_pred)
        y_pred_roc = clf.predict_proba(X_test)[:,1]
        roc = roc_auc_score(y_test,y_pred_roc)
        return accuracy,recall,f1,precision,roc
    
    def feature_selection(self,X_train = None,X_test = None, method = 'pca'):
        if method == 'pca':
            extract = PCA(n_components = 1000,random_state = 42)
        else:
            extract = SelectKBest(chi2, k=1000)
        X_train = extract.fit_transform(X_train)
        X_test = extract.transform(X_test)
        return X_train,X_test
        
    def model_selection(self, model='Logistics', param =None,min_df=None, ngram=None, method = None):
        accu = []
        reca = []
        f1s = []
        pres = []
        rocau = []
        model_name, model_func = self.model_mapping(model)
        method_used = 'none'
        for train_index, test_index in self.split:
            X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
            y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]

            train_vec, test_vec = self.extraction(min_df=min_df,ngram=ngram,X_train=X_train, X_test=X_test)
            scale = StandardScaler()
            X_train = X_train.copy()
            X_test = X_test.copy()
            train_vec['og_length'] = scale.fit_transform(X_train[['length']])
            test_vec['og_length'] = scale.transform(X_test[['length']])
            if train_vec.shape[1] > 2000:
                if method == 'pca':
                    train_vec, test_vec = self.feature_selection(train_vec, test_vec, method='pca')
                    method_used = 'pca'
                elif method == 'selectkbest':
                    train_vec, test_vec = self.feature_selection(train_vec, test_vec, method='selectkbest')
                    method_used = 'selectkbest'
            else:
                method_used = 'none'
            acc,rec,f1,pre,roc = self.train_report(model = model_func, X_train = train_vec, y_train=y_train, X_test=test_vec, y_test=y_test,param = param)
            accu.append(acc)
            reca.append(rec)
            f1s.append(f1)
            pres.append(pre)
            rocau.append(roc)
        accuracy = np.mean(accu)
        recall = np.mean(reca)
        f1_score = np.mean(f1s)
        precision = np.mean(pres)
        roc_auc = np.mean(rocau)
        return method_used,accuracy,recall,f1_score,precision,roc_auc


def main(model, data, min_df=None, ngram=None, param=None, method = None):
    results_df = pd.DataFrame(columns=['feature_extraction','feature_selection','model', 'accuracy', 'recall', 'f1_score', 'precision', 'roc_auc'])

    spam_instance = Spam(data=data)

    if model == 'KNN':
        knn_params = {
            'n_neighbors': [1, 3, 5],
            'metric': ['cosine', 'euclidean', 'manhattan']
        }
        for knn_param in product(knn_params['n_neighbors'], knn_params['metric']):
            knn_param_dict = {'n_neighbors': knn_param[0], 'metric': knn_param[1]}
            for min_df_val, ngram_val in product(min_df, ngram):
                select, acc, rec, f1, pre, roc = spam_instance.model_selection(model=model, param=knn_param_dict, min_df=min_df_val, ngram=ngram_val)
                model_str = f"{model}{{{min_df_val},{ngram_val}}}"
                results_df_length = len(results_df)
                results_df.loc[results_df_length] = [select, model_str, acc, rec, f1, pre, roc]
    else:
        for min_df_val, ngram_val in product(min_df, ngram):
            select, acc, rec, f1, pre, roc = spam_instance.model_selection(model=model, param=param, min_df=min_df_val, ngram=ngram_val, method = method)
            extraction_str = f"TF_IDF{{{min_df_val},{ngram_val}}}"
            results_df_length = len(results_df)
            results_df.loc[results_df_length] = [extraction_str,select, model, acc, rec, f1, pre, roc]
            print(results_df.loc[results_df_length])
    return results_df

In [15]:
if __name__ == '__main__':

    data=pd.read_csv('datacleaning.csv')
    data['text'].fillna('',inplace = True)
    data=data.copy()
    data['length'] = data['text_'].apply(lambda x: len(x.split()))
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    mapping = {'OR' : 0, 'CG' : 1}
    data['label'] = le.fit_transform(data['label'].map(mapping))
    # check = Spam(data=data)

    # Define parameter combinations for each model
    params = {
        'Logistics': {'max_iter': 1000, 'class_weight': 'balanced', 'random_state': 42}}
        #'SVM': {'probability': True},
        #'Naive Bayes':
    min_df_values = [0.001]
    ngram_values = [(1,1),(1,2),(1, 3),(2, 2),(2, 3),(3, 3)]

    results_df = pd.DataFrame(columns=['feature_extraction','feature_selection','model', 'accuracy', 'recall', 'f1_score', 'precision', 'roc_auc'])

    for model_name, param in params.items():
        results = main(model=model_name, data=data, min_df=min_df_values, ngram=ngram_values, param=param,method = 'selectkbest')
        results_df = pd.concat([results_df, results], ignore_index=True)
    
    print(results_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['text'].fillna('',inplace = True)


feature_extraction    TF_IDF{0.001,(1, 1)}
feature_selection                      pca
model                            Logistics
accuracy                          0.849253
recall                            0.841937
f1_score                          0.848122
precision                         0.854406
roc_auc                           0.927268
Name: 0, dtype: object


KeyboardInterrupt: 

In [10]:
results_df['length_used'] = 'StandardScaler'

In [11]:
results_df['data'] = 'fake_reviews_dataset'

In [12]:
results_df = results_df[['data','length_used','feature_extraction','feature_selection','model', 'accuracy', 'f1_score','recall', 'precision', 'roc_auc']]

In [13]:
#results_df.to_csv('mindf_0.001_0.01_pca_logistics.csv',index=False)

In [14]:
results_df

Unnamed: 0,data,length_used,feature_extraction,feature_selection,model,accuracy,f1_score,recall,precision,roc_auc
0,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(1, 1)}",none,Logistics,0.81156,0.812744,0.818152,0.807415,0.894533
1,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(1, 2)}",none,Logistics,0.825707,0.825884,0.827122,0.824679,0.909541
2,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(1, 3)}",none,Logistics,0.826276,0.826314,0.826918,0.825732,0.909708
3,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(2, 2)}",none,Logistics,0.672141,0.62081,0.536804,0.736009,0.750723
4,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(2, 3)}",none,Logistics,0.672289,0.619996,0.534723,0.737649,0.752459
5,fake_reviews_dataset,StandardScaler,"TF_IDF{0.01,(3, 3)}",none,Logistics,0.591462,0.591582,0.591949,0.591427,0.634581
6,fake_reviews_dataset,StandardScaler,"TF_IDF{0.001,(1, 1)}",pca,Logistics,0.849253,0.848122,0.841937,0.854406,0.927268
7,fake_reviews_dataset,StandardScaler,"TF_IDF{0.001,(1, 2)}",pca,Logistics,0.872304,0.871436,0.865889,0.877083,0.946797
8,fake_reviews_dataset,StandardScaler,"TF_IDF{0.001,(1, 3)}",pca,Logistics,0.874555,0.873539,0.866807,0.880395,0.948573
9,fake_reviews_dataset,StandardScaler,"TF_IDF{0.001,(2, 2)}",pca,Logistics,0.806367,0.796963,0.759895,0.837842,0.885487
