# Importing ML, Data Preprocessing and NLP Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Standard-library module that exposes the ASCII punctuation characters
# (string.punctuation), used by remove_punctuation().
import string
string.punctuation  # bare expression; not the last line of the cell, so nothing is displayed

# Regular expressions, used by remove_punctuation() and tokenization().
import re

# NLTK provides the English stop-word list and the WordNet lemmatizer.
import nltk
# English stop words consulted by remove_stopwords().
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded once
# (nltk.download('stopwords')) - confirm on a fresh environment.
stopwords = nltk.corpus.stopwords.words('english')

from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')    ## one-time download of the WordNet data
# Single lemmatizer instance shared by lemmatizer().
wordnet_lemmatizer = WordNetLemmatizer()

#!pip install wordninja
# wordninja is used by tokenization_advance() to split long run-together tokens.
import wordninja

from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#for word embedding
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

# To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

# Importing dataset

In [4]:
# Read the training reviews and the reviews to be scored from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Label columns of the training data; the model loop fits one classifier per entry.
target_column_list = [
    'Components',
    'Delivery and Customer Support',
    'Design and Aesthetics',
    'Dimensions',
    'Features',
    'Functionality',
    'Installation',
    'Material',
    'Price',
    'Quality',
    'Usability',
    'Polarity',
]

# NLP Data Preprocessing functions

In [6]:
#strip digits and punctuation characters from a raw string
def remove_punctuation(text):
    """Return ``text`` with every run of digits and every ASCII punctuation character removed.

    Whitespace is left untouched, so removed characters may leave doubled
    or trailing spaces behind.
    """
    without_digits = re.sub(r'\d+', '', text)
    # A translation table that maps each punctuation character to "delete".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return without_digits.translate(drop_punctuation)

def tokenization(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The string to tokenize.

    Returns
    -------
    list of str
        The non-empty tokens in their original order. An empty or
        punctuation/whitespace-only input yields an empty list.
    """
    # Raw string: '\W' inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about.
    pieces = re.split(r'\W+', text)
    # re.split emits '' when the text starts or ends with a separator;
    # those are not real tokens, so drop them.
    return [piece for piece in pieces if piece]

def tokenization_advance(text):
    """Expand over-long tokens into their component words.

    ``text`` is an iterable of tokens. Tokens of 15 or more characters are
    passed to ``wordninja.split`` and replaced by the pieces it returns;
    shorter tokens are kept unchanged. Order is preserved.
    """
    expanded = []
    for token in text:
        pieces = wordninja.split(token) if len(token) >= 15 else [token]
        expanded += pieces
    return expanded

#filter stop words out of a list of tokens
def remove_stopwords(text):
    """Return the tokens of ``text`` that do not appear in the module-level ``stopwords``."""
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

#lemmatize every token of a tokenized text
def lemmatizer(text):
    """Return a list with each token of ``text`` run through ``wordnet_lemmatizer.lemmatize``."""
    lemmas = []
    for token in text:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas

def make_sentence(text):
    """Join tokens into a single space-separated string.

    Parameters
    ----------
    text : iterable of str
        The tokens to join.

    Returns
    -------
    str
        The tokens separated by single spaces, with leading and trailing
        whitespace stripped. An empty input yields ``''``.
    """
    # str.join builds the result in one pass instead of re-copying the
    # growing string on every iteration.
    return ' '.join(text).strip()

# Preprocessing Training and Test data

In [7]:
# Text-cleaning pipeline for the training reviews. Each stage is stored in its
# own column so the intermediate results stay inspectable.
# 1. strip digits and punctuation from the raw review
train_df['clean_msg'] = train_df['Review'].apply(remove_punctuation)
# 2. lower-case the cleaned text
train_df['msg_lower'] = train_df['clean_msg'].apply(str.lower)
# 3. split into tokens
train_df['msg_tokenied'] = train_df['msg_lower'].apply(tokenization)
# 4. break up over-long, run-together tokens
train_df['msg_tokenied_separate'] = train_df['msg_tokenied'].apply(tokenization_advance)
# 5. drop stop words
train_df['no_stopwords'] = train_df['msg_tokenied_separate'].apply(remove_stopwords)
# 6. lemmatize the remaining tokens
train_df['msg_lemmatized'] = train_df['no_stopwords'].apply(lemmatizer)
# 7. re-join the tokens into the string that is fed to the vectorizer
train_df['sentence'] = train_df['msg_lemmatized'].apply(make_sentence)


In [8]:
# Same text-cleaning pipeline as for the training data, applied to the test
# reviews so both frames end up with an identically built 'sentence' column.
# 1. strip digits and punctuation from the raw review
test_df['clean_msg'] = test_df['Review'].apply(remove_punctuation)
# 2. lower-case the cleaned text
test_df['msg_lower'] = test_df['clean_msg'].apply(str.lower)
# 3. split into tokens
test_df['msg_tokenied'] = test_df['msg_lower'].apply(tokenization)
# 4. break up over-long, run-together tokens
test_df['msg_tokenied_separate'] = test_df['msg_tokenied'].apply(tokenization_advance)
# 5. drop stop words
test_df['no_stopwords'] = test_df['msg_tokenied_separate'].apply(remove_stopwords)
# 6. lemmatize the remaining tokens
test_df['msg_lemmatized'] = test_df['no_stopwords'].apply(lemmatizer)
# 7. re-join the tokens into the string that is fed to the vectorizer
test_df['sentence'] = test_df['msg_lemmatized'].apply(make_sentence)

# Model Building

### Model 1 - Logistic Regression

In [11]:
# Empty frame that the model loop fills with one predicted-probability column per target.
final_df = pd.DataFrame()

In [12]:
def _report_split(model, features, y_true):
    """Print accuracy, confusion matrix and classification report of ``model`` on one split."""
    y_pred = model.predict(features)
    print(metrics.accuracy_score(y_true, y_pred))            ## Accuracy
    print(metrics.confusion_matrix(y_true, y_pred))          ## confusion_matrix
    print(metrics.classification_report(y_true, y_pred))     ## classification_report


# The TF-IDF features depend only on the review text, not on the target being
# modelled, so they are built once instead of being refit for every target.
vectorizer = TfidfVectorizer(max_features=2000, sublinear_tf=False, norm='l1', ngram_range=(1, 2))
processed_features = vectorizer.fit_transform(train_df['sentence']).toarray()
# The test reviews are projected with the vocabulary/IDF learned on the training text.
processed_features_test = vectorizer.transform(test_df['sentence']).toarray()
# NOTE(review): the vectorizer is fitted on all training rows before the
# hold-out split below, so the hold-out metrics are computed on rows whose text
# already influenced the vocabulary/IDF weights. Treat them as slightly optimistic.

# Hyperparameters shared by the evaluation model and the final model.
# NOTE(review): penalty='none' is deprecated in newer scikit-learn releases in
# favour of penalty=None; kept unchanged to match the version this notebook
# was run with - confirm before upgrading.
lr_params = dict(solver='saga', penalty='none', class_weight='balanced', random_state=0)

for target in target_column_list:
    labels = train_df[target]

    # Hold-out evaluation: fit on 70% of the rows, report on both splits
    # (training split first, then the held-out split).
    X_train, X_test, y_train, y_test = train_test_split(
        processed_features, labels, test_size=0.3, random_state=0
    )
    LR_model = LogisticRegression(**lr_params)
    LR_model.fit(X_train, y_train)
    _report_split(LR_model, X_train, y_train)
    _report_split(LR_model, X_test, y_test)

    # Final model: refit on every training row before scoring the test reviews.
    LR_model = LogisticRegression(**lr_params)
    LR_model.fit(processed_features, labels)

    # Column 1 of predict_proba is the probability of the second entry of
    # LR_model.classes_; that column is stored as the score for this target.
    probs = LR_model.predict_proba(processed_features_test)[:, 1]
    final_df[target] = probs
    

0.9660069848661234
[[3951  146]
 [   0  198]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      4097
           1       0.58      1.00      0.73       198

    accuracy                           0.97      4295
   macro avg       0.79      0.98      0.86      4295
weighted avg       0.98      0.97      0.97      4295

0.9163498098859315
[[1644  121]
 [  33   43]]
              precision    recall  f1-score   support

           0       0.98      0.93      0.96      1765
           1       0.26      0.57      0.36        76

    accuracy                           0.92      1841
   macro avg       0.62      0.75      0.66      1841
weighted avg       0.95      0.92      0.93      1841

0.9860302677532014
[[4114   60]
 [   0  121]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4174
           1       0.67      1.00      0.80       121

    accuracy                           0.99 

0.9685681024447031
[[ 955    4]
 [ 131 3205]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.93       959
           1       1.00      0.96      0.98      3336

    accuracy                           0.97      4295
   macro avg       0.94      0.98      0.96      4295
weighted avg       0.97      0.97      0.97      4295

0.8756110809342749
[[ 331   76]
 [ 153 1281]]
              precision    recall  f1-score   support

           0       0.68      0.81      0.74       407
           1       0.94      0.89      0.92      1434

    accuracy                           0.88      1841
   macro avg       0.81      0.85      0.83      1841
weighted avg       0.89      0.88      0.88      1841



In [13]:
# Preview the first rows of the per-target predicted probabilities.
final_df.head()

Unnamed: 0,Components,Delivery and Customer Support,Design and Aesthetics,Dimensions,Features,Functionality,Installation,Material,Price,Quality,Usability,Polarity
0,0.983101,0.067344,0.052661,0.080811,0.003175,0.003264,0.018721,0.7415637,0.022744,0.742178,0.006829,0.002306
1,0.000947,0.309293,0.003103,0.003719,0.00039,0.999141,0.000208,2.458514e-05,0.020265,0.069112,0.150672,0.997559
2,4e-05,0.00024,0.000293,0.000217,5.1e-05,0.041046,0.008762,3.634202e-12,0.01081,0.00934,1.0,0.999996
3,0.970305,0.048227,0.151059,0.14621,0.000449,0.999884,0.015313,2.830082e-05,0.022688,0.365146,0.232309,0.962767
4,2e-06,0.168185,0.001752,0.022483,0.318097,0.995071,0.041705,7.334437e-07,0.000172,0.998302,0.002306,1.0


In [14]:
# Write the per-target predicted probabilities to CSV, omitting the row index.
final_df.to_csv('final_df_v10.csv', index=False)