### importing required libraries

In [1]:
import pandas as pd
import numpy as np
import re
import string
from string import punctuation


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer


stop_words = set(stopwords.words('english'))


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping


from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

from warnings import filterwarnings
filterwarnings('ignore')

### Loading datasets

In [2]:
# Load dataset
dataset = pd.read_excel(f"NLP_data_scientist_test\\data\\Entity_sentiment_trainV2.xlsx")

In [3]:
# checking top 5 rows
dataset.head()

Unnamed: 0,Sentence,Entity,Sentiment
0,The website was very easy to use and my insura...,website,positive
1,The web sight was easy to understand and I got...,web sight,positive
2,Having filled in the application on-line I cou...,point,negative
3,After finding AXA was cheaper than my renewal ...,prices,positive
4,The quote was a reasonable price compared with...,insurances,positive


In [4]:
# checking dataset size(no of rows and columns)
dataset.shape

(5999, 3)

### checking datatypes

In [5]:
dataset.dtypes

Sentence     object
Entity       object
Sentiment    object
dtype: object

#### Checking NA values 

In [6]:
dataset.isnull().sum()

Sentence     0
Entity       0
Sentiment    0
dtype: int64

#### checking duplicate rows in dataset

In [7]:
duplicates = dataset[dataset.duplicated()]

In [8]:
duplicates.shape

(37, 3)

#### droping duplicates

In [9]:
dataset.drop_duplicates(inplace=True)

In [10]:
dataset.shape

(5962, 3)

#### checking no of categories in entity column

In [11]:
entity_categories_df = pd.DataFrame(dataset['Entity'].value_counts())

In [12]:
entity_categories_df.head()

Unnamed: 0,Entity
service,701
price,349
website,276
insurance,171
axa,144


In [13]:
entity_categories_df.shape

(748, 1)

In [14]:
# filtering categories in entity column with single frequency categories
other_cats = entity_categories_df[entity_categories_df['Entity'] == 1].index.tolist()

In [62]:
len(other_cats)

333

#### converting single value categories into other category

In [15]:
dataset['Entity'] = dataset['Entity'].apply(lambda x:"others" if x in other_cats else x)

In [16]:
dataset['Entity'].value_counts()

service               701
price                 349
others                333
website               276
insurance             171
                     ... 
axa policy              2
levels                  2
cause                   2
insurance companys      2
appointment             2
Name: Entity, Length: 416, dtype: int64

#### applying one hot encoding for entity column

In [17]:
entity_col = dataset[['Entity']]

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(entity_col)
entity_enc = enc.transform(entity_col)

In [18]:
entity_enc.shape

(5962, 416)

#### applying preprocessing techniques on sentence column

In [19]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

In [20]:
def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    
    return text

In [21]:
sentence_col =np.array(dataset['Sentence'].apply(clean_text)).ravel()

In [22]:
sentence_col.shape

(5962,)

In [23]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [24]:
tf_vector = get_feature_vector(sentence_col)
sentence_col_tf = tf_vector.transform(sentence_col)

#### checking target class values

In [25]:
dataset['Sentiment'].value_counts()

positive    4075
negative    1887
Name: Sentiment, dtype: int64

#### seperating X and y variables

In [26]:
X = np.concatenate([sentence_col_tf.toarray(), entity_enc.toarray()], axis=1)
y = np.array(dataset['Sentiment']).ravel()

####  train and test splitting

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

### Model Building

In [28]:
def mul_models(X_train: pd.DataFrame , y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame) -> pd.DataFrame:

    
    dfs = []
    
    models = [
              ('LogReg', LogisticRegression()), 
              ('SVM', SVC()), 
              ('GNB', GaussianNB()),
              ('SGD', SGDClassifier())
            ]
    
    results = []
    names = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
    target_names = ['positive', 'negative']
        
    for name, model in models:
        
        kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=12345)
        
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))
        
        results.append(cv_results)
        names.append(name)

        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        
        dfs.append(this_df)

    final = pd.concat(dfs, ignore_index=True)
    
    
    return final

In [29]:
final = mul_models(X_train, y_train, X_test, y_test)

LogReg
              precision    recall  f1-score   support

    positive       0.85      0.72      0.78       399
    negative       0.87      0.94      0.90       794

    accuracy                           0.87      1193
   macro avg       0.86      0.83      0.84      1193
weighted avg       0.86      0.87      0.86      1193

SVM
              precision    recall  f1-score   support

    positive       0.85      0.68      0.76       399
    negative       0.86      0.94      0.89       794

    accuracy                           0.85      1193
   macro avg       0.85      0.81      0.83      1193
weighted avg       0.85      0.85      0.85      1193

GNB
              precision    recall  f1-score   support

    positive       0.41      0.79      0.54       399
    negative       0.81      0.44      0.57       794

    accuracy                           0.56      1193
   macro avg       0.61      0.61      0.56      1193
weighted avg       0.68      0.56      0.56      1193

SGD


In [30]:
final.sort_values(by=['test_f1_weighted', 'test_accuracy'], ascending=False)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_weighted,test_recall_weighted,test_f1_weighted,test_roc_auc,model
16,1.381199,0.030247,0.889937,0.888398,0.889937,0.888443,0.949135,SGD
1,1.277526,0.046858,0.885744,0.88642,0.885744,0.881449,0.949198,LogReg
3,1.352544,0.031271,0.871069,0.86897,0.871069,0.869319,0.931249,LogReg
15,1.472953,0.02858,0.866876,0.864653,0.866876,0.86507,0.928308,SGD
17,1.474567,0.031986,0.865828,0.863955,0.865828,0.864159,0.91968,SGD
0,1.2741,0.041606,0.86478,0.862495,0.86478,0.862945,0.931447,LogReg
6,28.328792,22.323003,0.865828,0.863984,0.865828,0.861471,0.94073,SVM
5,23.30754,21.712829,0.863732,0.861121,0.863732,0.860963,0.924019,SVM
18,1.367096,0.030917,0.856394,0.856538,0.856394,0.856465,0.920973,SGD
2,1.211885,0.046862,0.857442,0.856006,0.857442,0.853185,0.920417,LogReg


#### v. ANN

In [31]:
y_train_ann = pd.Series(y_train).replace({"positive":1, "negative":0})
y_test_ann = pd.Series(y_test).replace({"positive":1, "negative":0})

In [32]:
y_train_ann = np.asarray(y_train_ann).astype('float32').reshape((-1,1))
y_test_ann = np.asarray(y_test_ann).astype('float32').reshape((-1,1))

In [33]:
model = Sequential()
model.add(Dense(units=4799, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(units=2000, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(units=500, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(units=1, activation="sigmoid"))
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])
early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=2)

In [34]:
model.build(input_shape=X_train.shape)

In [35]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
module_wrapper (ModuleWrappe (4769, 4799)              28026160  
_________________________________________________________________
dropout (Dropout)            (4769, 4799)              0         
_________________________________________________________________
module_wrapper_1 (ModuleWrap (4769, 2000)              9600000   
_________________________________________________________________
dropout_1 (Dropout)          (4769, 2000)              0         
_________________________________________________________________
module_wrapper_2 (ModuleWrap (4769, 500)               1000500   
_________________________________________________________________
dropout_2 (Dropout)          (4769, 500)               0         
_________________________________________________________________
module_wrapper_3 (ModuleWrap (4769, 1)                 5

In [36]:
model.fit(
    x=X_train,
    y=y_train_ann,
    batch_size=100,
    epochs=100,
    validation_data=(X_test, y_test_ann),
    verbose=1,
    callbacks=early_stop
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 00003: early stopping


<tensorflow.python.keras.callbacks.History at 0x2af9629e310>

In [37]:
accr = model.evaluate(X_train, y_train_ann)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.015
  Accuracy: 0.997


In [38]:
accr = model.evaluate(X_test, y_test_ann)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.437
  Accuracy: 0.858


#### Building SGD model After compared all model performances

In [44]:
SGD_model = SGDClassifier()
SGD_model.fit(X_train, y_train)

y_pred_test_sgd = SGD_model.predict(X_test)

In [45]:
print("test data")
print(confusion_matrix(y_test, y_pred_test_sgd))

test data
[[316  83]
 [ 67 727]]


In [46]:
print("Test data: ", "\n", classification_report(y_test, y_pred_test_sgd))

Test data:  
               precision    recall  f1-score   support

    negative       0.83      0.79      0.81       399
    positive       0.90      0.92      0.91       794

    accuracy                           0.87      1193
   macro avg       0.86      0.85      0.86      1193
weighted avg       0.87      0.87      0.87      1193



#### Pickling model

In [50]:
import pickle

# Save the Modle to file in the current working directory

Pkl_Filename = "Pickle_SGD_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(SGD_model, file)

### performaing predictions on test data based on finalised model

In [51]:
test_ds = pd.read_excel(f"NLP_data_scientist_test\\data\\Entity_sentiment_testV2.xlsx")

In [52]:
test_ds.shape

(1290, 2)

In [53]:
test_ds.isnull().sum()

Sentence    0
Entity      0
dtype: int64

In [54]:
test_ds.drop_duplicates(inplace=True)
test_ds.dropna(axis=0, how='all')

# applying one hot encoding for entity column
test_ds['Entity'] = test_ds['Entity'].apply(lambda x:"others" if x in other_cats else x)
entity_test_enc = enc.transform(test_ds[['Entity']])

# tfidf for sentence column
sentence_test_col =np.array(test_ds['Sentence'].apply(clean_text)).ravel()
sentence_test_col_tf = tf_vector.transform(sentence_test_col)

# concatenating both arrays
test_feature = np.concatenate([sentence_test_col_tf.toarray(), entity_test_enc.toarray()], axis=1)

In [55]:
Pkl_Filename = "Pickle_SGD_Model.pkl"  

# Load the Model back from file
with open(Pkl_Filename, 'rb') as file:  
    Pickled_SGD_Model = pickle.load(file)

In [56]:
# using best model and checking performance on test dataset
test_prediction_lr = Pickled_SGD_Model.predict(test_feature)

print(test_prediction_lr)

['positive' 'positive' 'positive' ... 'negative' 'negative' 'negative']


In [63]:
pred_data = pd.DataFrame(data=test_prediction_lr, columns=['predictions'])

In [64]:
pred_data.to_excel("test_predictions.xlsx")