In [1]:
!pip3 install lightgbm



In [2]:
!pip3 install catboost



# Importing Packages

In [3]:
import time
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [4]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [5]:
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

In [6]:
pd.set_option('max_colwidth', None)

# Loading Data

In [7]:
df = pd.read_csv('fake reviews dataset.csv', names=['category', 'rating', 'label', 'text'])

In [8]:
df.head()

Unnamed: 0,category,rating,label,text
0,category,rating,label,text_
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I"


# Check for data imbalance

In [9]:
df['label'].value_counts()

CG       20216
OR       20216
label        1
Name: label, dtype: int64

# Prepare the data

In [10]:
df['text'] = df['text'].str.replace('\n', ' ')

In [11]:
df['target'] = np.where(df['label']=='CG', 1, 0)

In [12]:
df['target'].value_counts()

0    20217
1    20216
Name: target, dtype: int64

# Create features from punctuation

In [13]:
def punctuation_to_features(df, column):
    """Identify punctuation within a column and convert to a text representation.
    
    Args:
        df (object): Pandas dataframe.
        column (string): Name of column containing text. 
        
    Returns:
        df[column]: Original column with punctuation converted to text, 
                    i.e. "Wow! > "Wow exclamation"
    
    """
    
    df[column] = df[column].replace('!', ' exclamation ')
    df[column] = df[column].replace('?', ' question ')
    df[column] = df[column].replace('\'', ' quotation ')
    df[column] = df[column].replace('\"', ' quotation ')
    
    return df[column]

In [14]:
df['text'] = punctuation_to_features(df, 'text')

# Tokenize the data

In [15]:
nltk.download('punkt');

def tokenize(column):
    """Tokenizes a Pandas dataframe column and returns a list of tokens.
    
    Args:
        column: Pandas dataframe column (i.e. df['text']).
    
    Returns:
        tokens (list): Tokenized list, i.e. [Donald, Trump, tweets]
    
    """
    
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()] 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KAVITA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
df['tokenized'] = df.apply(lambda x: tokenize(x['text']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized
0,category,rating,label,text_,0,[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]"


# Stopword removal

In [17]:
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KAVITA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def remove_stopwords(tokenized_column):
    """Return a list of tokens with English stopwords removed. 
    
    Args:
        column: Pandas dataframe column of tokenized data from tokenize()
    
    Returns:
        tokens (list): Tokenized list with stopwords removed.
    
    """
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]

In [19]:
df['stopwords_removed'] = df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized,stopwords_removed
0,category,rating,label,text_,0,[],[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]","[Love, Well, made, sturdy, comfortable, I, love, Very, pretty]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, mine, couple, years]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]","[Missing, information, use, great, product, price, I]"


# Apply Porter stemming

In [20]:
def apply_stemming(tokenized_column):
    """Return a list of tokens with Porter stemming applied.
    
    Args:
        column: Pandas dataframe column of tokenized data with stopwords removed.
    
    Returns:
        tokens (list): Tokenized list with words Porter stemmed.
    
    """
    
    stemmer = PorterStemmer() 
    return [stemmer.stem(word).lower() for word in tokenized_column]

In [21]:
df['porter_stemmed'] = df.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized,stopwords_removed,porter_stemmed
0,category,rating,label,text_,0,[],[],[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]","[Love, Well, made, sturdy, comfortable, I, love, Very, pretty]","[love, well, made, sturdi, comfort, i, love, veri, pretti]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, mine, couple, years]","[love, great, upgrad, origin, i, mine, coupl, year]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]","[thi, pillow, save, back, i, love, look, feel, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]","[Missing, information, use, great, product, price, I]","[miss, inform, use, great, product, price, i]"


# Rejoin words

In [22]:
def rejoin_words(tokenized_column):
    return ( " ".join(tokenized_column))

In [23]:
df['all_text'] = df.apply(lambda x: rejoin_words(x['porter_stemmed']), axis=1)

In [24]:
df[['all_text']].head()

Unnamed: 0,all_text
0,
1,love well made sturdi comfort i love veri pretti
2,love great upgrad origin i mine coupl year
3,thi pillow save back i love look feel pillow
4,miss inform use great product price i


# Create training and test data

In [25]:
X = df['all_text']
y = df['target']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=True)

# Run the model selection process

In [27]:
classifiers = {}
classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',
                                                   objective='binary:logistic',
                                                   )})
classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
classifiers.update({"LinearSVC": LinearSVC()})
classifiers.update({"MultinomialNB": MultinomialNB()})
classifiers.update({"LGBMClassifier": LGBMClassifier()})
classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
classifiers.update({"RidgeClassifier": RidgeClassifier()})
classifiers.update({"SGDClassifier": SGDClassifier()})
classifiers.update({"BaggingClassifier": BaggingClassifier()})
classifiers.update({"BernoulliNB": BernoulliNB()})

In [28]:
df_models = pd.DataFrame(columns=['model', 'run_time', 'roc_auc', 'roc_auc_std'])

for key in classifiers:
    
    start_time = time.time()
    pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("clf", classifiers[key] )])
    cv = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')

    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60,2)),
           'roc_auc': cv.mean(),
           'roc_auc_std': cv.std(),
    }
    
    df_models = df_models.append(row, ignore_index=True)
    
df_models = df_models.sort_values(by='roc_auc', ascending=False)

  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)


[LightGBM] [Info] Number of positive: 16173, number of negative: 16173
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.250084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 176207
[LightGBM] [Info] Number of data points in the train set: 32346, number of used features: 3262
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 16173, number of negative: 16173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.227935 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171382
[LightGBM] [Info] Number of data points in the train set: 32346, number of used features: 3141
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 16172, numbe

  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)


In [29]:
df_models

Unnamed: 0,model,run_time,roc_auc,roc_auc_std
11,SGDClassifier,0.11,0.925487,0.008782
1,CatBoostClassifier,37.34,0.922597,0.009725
2,LinearSVC,0.12,0.922388,0.01261
10,RidgeClassifier,0.12,0.922283,0.013242
4,LGBMClassifier,0.65,0.917487,0.010876
0,XGBClassifier,2.4,0.916505,0.010626
5,RandomForestClassifier,8.17,0.91157,0.013524
3,MultinomialNB,0.1,0.901796,0.019819
12,BaggingClassifier,11.37,0.857332,0.009445
8,AdaBoostClassifier,0.63,0.844253,0.020967


# Assess the selected model

In [30]:
bundled_pipeline = Pipeline([("tfidf", TfidfVectorizer()), 
                             ("clf", SGDClassifier())
                            ])
bundled_pipeline.fit(X_train, y_train)
y_pred = bundled_pipeline.predict(X_test)

In [31]:
accuracy_score = accuracy_score(y_test, y_pred)
precision_score = precision_score(y_test, y_pred)
recall_score = recall_score(y_test, y_pred)
roc_auc_score = roc_auc_score(y_test, y_pred)

In [32]:
print('Accuracy:', accuracy_score)
print('Precision:', precision_score)
print('Recall:', recall_score)
print('ROC/AUC:', roc_auc_score)

Accuracy: 0.8695795548227535
Precision: 0.8928755364806867
Recall: 0.8444552687124534
ROC/AUC: 0.8699750803451731
