# Natural Language Processing

## Importing the libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
 
# Project directory: the dataset is read from here, and classification_model()
# writes its pickled model here when called with save=True.
# NOTE(review): hard-coded absolute path to one user's machine -- adjust
# before running anywhere else.
projDir = 'C:/Users/neelkanth mehta/Documents/Udemy/udemy-machine_learning/Section 34 - NLP'
# Tab-separated reviews file that the __main__ section loads with pd.read_csv.
datafile= os.path.join(projDir,'Restaurant_Reviews.tsv')
 
 
'''Loading and preprocessing dataset'''
import re
import nltk
 
# Fetch the NLTK 'stopwords' corpus that generate_corpus() reads through
# stopwords.words('english'). This is a module-level call, so it runs every
# time the file is executed or imported.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
 
 
def generate_corpus(df:pd.DataFrame, verbose=False) -> tuple:
    '''
    Cleans the review texts and turns them into a bag-of-words matrix.

    Every review has its non-letter characters replaced by spaces, is
    lower-cased and split on whitespace; English stopwords are dropped and
    the remaining words are Porter-stemmed. The cleaned reviews are then
    vectorized with CountVectorizer(max_features=1500).

    inputs:
        df -> pd.DataFrame() object with a 'Review' column (text) and a
              'Liked' column (labels); any index is accepted
        verbose -> bool, whether to print a random sample of the cleaned corpus

    outputs:
        X -> pd.DataFrame() of dense term counts, one row per review in the
             order of df (with a fresh 0..n-1 index)
        y -> pd.Series(), the 'Liked' column of df (keeps df's own index)
    '''
    # Built once up front: re-creating the stopword set for every word and the
    # stemmer for every review is loop-invariant work.
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    corpus = []
    # Iterate the column values directly instead of df.loc[i, 'Review'] so the
    # function does not require df to carry a default 0..n-1 index.
    for text in df['Review']:
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        # Stopwords are filtered on the raw lower-cased word, before stemming.
        kept = [stemmer.stem(word) for word in words if word not in english_stopwords]
        corpus.append(' '.join(kept))

    # printing the output
    if verbose:
        print('corpus  \n')
        # Cap the sample size so small inputs (< 10 reviews) do not raise.
        print(pd.Series(corpus).sample(min(10, len(corpus))))
        print('')

    # Fitting and transforming the corpus with CountVectorizer()
    count_vect = CountVectorizer(max_features=1500)
    X = pd.DataFrame(count_vect.fit_transform(corpus).toarray())
    y = df['Liked']

    return X, y
    
 
def split_dataset(X, y, test_size=0.2, shuffle=True, random_state=0, stratify=None):
    '''
    Uses train_test_split function of sklearn library to split the dataset

    inputs:
        X -> feature matrix (array-like / pd.DataFrame), one row per sample
        y -> labels (array-like / pd.Series), same length as X
        test_size -> float between 0 and 1, share of samples put in the test set
        shuffle -> boolean, whether to shuffle before splitting
        random_state -> integer seed forwarded to train_test_split
        stratify -> array-like or None (default). When given, it is forwarded
                    to train_test_split so the split is stratified on these
                    labels; None keeps the previous, non-stratified behaviour.

    outputs: X_train, y_train, X_test, y_test
        NOTE: this order differs from sklearn's own
        (X_train, X_test, y_train, y_test); callers in this file unpack the
        order returned here.
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        shuffle=shuffle,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, y_train, X_test, y_test
 
 
'''Fitting model and scoring'''
def classification_model(X_train, y_train, X_test, y_test, model='LogisticRegression', save=False, graph=False, metrics:list=['accuracy_score'], **kwargs):
    '''
    Fits a classifier on the training split, optionally pickles it, and
    scores its predictions on the test split.

    inputs:
        X_train -> nd array. X train values
        y_train -> 1d array. y train values
        X_test -> nd array. X test values
        y_test -> 1d array. y test values.
        model -> str, 'LogisticRegression' by default. Name of an estimator
                 class that is in scope in this module, e.g.
                 'LogisticRegression', 'KNeighborsClassifier', 'SVC',
                 'GaussianNB', 'DecisionTreeClassifier' or
                 'RandomForestClassifier'.
        save -> boolean, False by default. When true the fitted model is
                dumped with joblib to <projDir>/<model>.pickle
        graph -> boolean, False by default. Accepted for compatibility but
                 currently unused: no graph is produced.
        metrics -> list of str. Names of metric functions in scope in this
                   module; each is called as metric(y_test, y_pred).
                   The default list is only read, never mutated.
        **kwargs -> forwarded unchanged to the estimator constructor
                    (e.g. random_state=0, C=0.2, kernel='rbf').

    output:
        perf_stats -> pd.Series() of the requested metrics, indexed by metric
                      name and named after `model`
        cm -> confusion matrix with its first two rows swapped and each row
              reversed; for a binary 0/1 problem this turns sklearn's
              [[TN, FP], [FN, TP]] into [[TP, FN], [FP, TN]]
    '''

    # Instantiating model
    # NOTE(review): eval() resolves `model` (and each metric name below) as a
    # Python expression in this module's scope. Only pass trusted, hard-coded
    # names -- never user-supplied strings.
    estimator = eval(model)(**kwargs)

    # Fitting model on the TRAINING split only. The previous code called
    # fit(X, y) on the module-level full dataset, which leaked the test rows
    # into training and raised NameError when those globals did not exist.
    estimator.fit(X_train, y_train)

    # Pickling model
    if save:
        joblib.dump(value=estimator, filename=os.path.join(projDir, str(model)+'.pickle'))

    # Model evaluation
    y_pred = estimator.predict(X_test)
    perf = {name: eval(name)(y_test, y_pred) for name in metrics}
    perf_stats = pd.Series(list(perf.values()), index=list(perf.keys()), name=model)

    cm = confusion_matrix(y_test, y_pred)
    # Reorder so the positive class comes first (see docstring).
    cm = np.vstack((cm[1][::-1], cm[0][::-1]))

    return perf_stats, cm
 
 
if __name__ == '__main__':

    # Load the tab-separated reviews (quoting=3 is csv.QUOTE_NONE) and build
    # the feature matrix, labels and train/test split.
    dataset = pd.read_csv(datafile, delimiter='\t', quoting=3)
    X, y = generate_corpus(df=dataset)
    X_train, y_train, X_test, y_test = split_dataset(X, y)

    # Every classifier below is scored with the same four metrics.
    score_names = ['accuracy_score', 'precision_score', 'recall_score', 'f1_score']

    def evaluate(name, **params):
        '''Fit and score one classifier on the shared split; return its metric Series.'''
        perf_stats, _cm = classification_model(
            X_train, y_train, X_test, y_test,
            model=name, metrics=score_names, **params)
        return perf_stats

    # Classification model execution (only the logistic regression is pickled)
    logreg = evaluate('LogisticRegression', penalty='l1', C=0.2,
                      solver='liblinear', random_state=0, save=True)
    KNN = evaluate('KNeighborsClassifier', weights='distance', p=2)
    svc = evaluate('SVC', kernel='linear', random_state=0, gamma='auto', C=0.1)
    svc_r = evaluate('SVC', kernel='rbf', random_state=0, gamma='auto', C=0.1)
    GNB = evaluate('GaussianNB')
    tree = evaluate('DecisionTreeClassifier', criterion='entropy',
                    max_depth=5, random_state=0)
    forest = evaluate('RandomForestClassifier', n_estimators=22,
                      criterion='entropy', random_state=0, max_depth=5)

    # One row per model, one column per metric.
    results = [logreg, KNN, svc, svc_r, GNB, tree, forest]
    print(pd.concat(results, axis=1).T)