In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Amazon Fine Food Review Analysis

*The Amazon Fine Food Reviews dataset consists of reviews of fine foods from Amazon.*
1. Number of reviews: 568,454
1. Number of Users : 256,059 users
1. Number of products : 74,258 products
1. Time span of taking reviews: Oct 1999 - Oct 2012
1. Number of attributes/column in Data: 10

*Attributes: Information:*
1. Id : Row Id
1. ProductId : Unique identifier for the product (74258 unique values )
2. UserId : Unqiue identifier for the user (256059 unique values)
3. ProfileName: Profile name of the user (218418 unique values)
4. HelpfulnessNumerator : Number of users who found the review helpful
5. HelpfulnessDenominator : Number of users who indicated whether they found the review helpful or not
6. Score : Rating between 1 and 5
7. Time : Timestamp for the review
8. Summary : Brief summary of the review (295744 unique values)
9. Text : Text of the review (393579 unique values ) (the most useful information for predicting postive or negative) 


**Objective**: *Given a review, determine whether the review is positive (rating of 4 or 5) or negative (rating of 1 or 2).*

Q). **How to determine if a review is positive or negative ?**

*Ans). We can use the Score/rating. A rating of 4 or 5 is considered a positive review and a rating of 1 or 2 a negative review. A rating of 3 is neutral and is ignored. This is an approximate, proxy way of determining the polarity (positivity/negativity) of a review.*

# *Loading the data *
**The dataset is available in two forms** 
1. .csv file
2.  SqLite Database

**To load the data we use the SQLite database, as it makes it easier to query and visualise the data efficiently.**
*Here as we only want to get the global sentiment of the recommendation(Positve or Negative), we will purposefully ignore all the scores equal to 3 ,If the Score is above 3 ,then the recommendation or review will be set to "positive". Otherwise ,it will be set to "negative".*


# Importing Important Library

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score,auc

import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os 
#mterics 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

# Reading  Data

In [None]:
# Open the SQLite copy of the dataset shipped with the Kaggle input bundle.
con = sqlite3.connect('/kaggle/input/amazon-fine-food-reviews/database.sqlite')

# Load only clearly positive or negative reviews: rows with Score = 3 (neutral)
# are excluded directly in the query.
filtered_data = pd.read_sql_query(sql="""SELECT * FROM Reviews WHERE SCORE!=3""", con=con)

# Give reviews with score >3  a positive rating "1" and review with score <3 a negative rating "0"
def partition(x):
    """Map a star rating to a binary label: 0 when the rating is below 3, otherwise 1."""
    return 0 if x < 3 else 1
        
# Replace the star rating with the binary label produced by partition():
# 0 for ratings below 3 (negative), 1 otherwise (positive).
filtered_data['Score'] = filtered_data['Score'].map(partition)
print("No. of data points in Dataset:",filtered_data.shape)
filtered_data.head(3)

# Exploratory Data Analysis
### Data Cleaning : Deduplication { The most important part of cleaning data}
*It was observed (as shown in the table below) that the review data had many duplicate entries. Hence it was necessary to remove the duplicates in order to get unbiased results from the analysis of the data. An example is given below.*

In [None]:
# Example of the duplication problem: all non-neutral reviews written by one
# user, ordered by product.
# The result is bound to `duplicate_reviews` rather than `display`, because
# assigning to `display` would shadow IPython's built-in display() helper for
# the rest of the session.
duplicate_reviews=pd.read_sql_query("""SELECT * FROM Reviews WHERE Score!=3 AND UserID='AR5J8UI46CURR' ORDER BY 
ProductId""",con)
duplicate_reviews

**As it can be seen above the same user has multiple reviews of the same values for** **HelpfulnessNumerator, HelpfulnessDenominator, Score, Time, Summary and Text and on doing analysis it was found that**
***ProductId=B000HDOPZG was Loacker Quadratini Vanilla Wafer Cookies, 8.82-Ounce Packages (Pack of 8)***

***ProductId=B000HDL1RQ was Loacker Quadratini Lemon Wafer Cookies, 8.82-Ounce Packages (Pack of 8) and so on***

***It was inferred after analysis that reviews with same parameters other than ProductId belonged to the same product just having different flavour or quantity. Hence in order to reduce redundancy it was decided to eliminate the rows having same parameters.***

***The method used was to first sort the data by ProductId and then keep only the first of each group of identical reviews, deleting the others.
For example, in the table above only the review for ProductId=B000HDL1RQ remains.
This ensures that there is only one representative for each product; deduplicating without sorting could leave different representatives for the same product.***

In [None]:
# Order the reviews by ProductId (ascending) so that the "first" row kept by the
# later de-duplication step is chosen from a product-sorted frame.
# All other sort options are left at the pandas defaults, which are the values
# that were previously spelled out explicitly.
sorted_data = filtered_data.sort_values(by='ProductId')

In [None]:
# De-duplication: rows that agree on user, profile name, timestamp and review
# text are treated as the same review; only the first occurrence is kept.
dedup_keys = ["UserId", "ProfileName", "Time", "Text"]
final_data = sorted_data.drop_duplicates(subset=dedup_keys, keep='first')
final_data.shape

In [None]:
# Percentage of the filtered rows that survived de-duplication.
remaining_pct = final_data['Id'].size / filtered_data['Id'].size * 100
print('The total data remain after cleaning data ', remaining_pct)

***Observation***: In the 2 rows shown below, the value of HelpfulnessNumerator is greater than HelpfulnessDenominator, which is not practically possible; hence such rows are also removed from the dataset.

In [None]:
# Show the two reviews whose HelpfulnessNumerator exceeds HelpfulnessDenominator.
# The OR is parenthesised: in SQL, AND binds tighter than OR, so without the
# parentheses the `Score!=3` condition would apply only to Id=44737 and
# Id=64422 would be returned regardless of its score.
# The result is not bound to `display`, to avoid shadowing IPython's display().
invalid_helpfulness=pd.read_sql_query("""SELECT * FROM Reviews WHERE Score!=3 AND (Id=44737 OR Id=64422)
ORDER BY ProductID""",con)
invalid_helpfulness.head()

In [None]:
# Keep only rows where the number of "helpful" votes does not exceed the total
# number of votes cast.
helpfulness_is_valid = final_data['HelpfulnessNumerator'] <= final_data['HelpfulnessDenominator']
final_data = final_data[helpfulness_is_valid]


In [None]:
# Before starting text preprocessing, report how many rows/columns remain
# after de-duplication and the helpfulness filter.
print(final_data.shape)
# Class balance: number of positive (1) and negative (0) reviews left.
final_data['Score'].value_counts()

# ***Text Preprocessing***
**Deduplication is now finished. The data requires some preprocessing before we go on to further analysis and build the prediction model.**
*In the preprocessing phase we perform the following steps:*
1. Begin by removing html tags
1. Remove punctuation and a limited set of special characters, such as `,`, `.` or `#`
1. Check that the word is made up of English letters only and is not alphanumeric
1. Check that the length of the word is greater than 2 (it was found that there are no adjectives with only 2 letters)
1. Convert the word to lowercase
1. Remove stopwords
1. Apply Snowball stemming to the word (Snowball stemming was observed to work better than Porter stemming)

***After this we will collect the words and will use to describe positive and negative reviews.***


In [None]:
# printing some random reviews
# Four reviews are picked by position; note that `sent_0` is taken from
# position 6, not position 0.
sent_0, sent_1000, sent_1500, sent_4900 = (
    final_data['Text'].values[position] for position in (6, 1000, 1500, 4900)
)

# Print each sample followed by a separator line.
for review in (sent_0, sent_1000, sent_1500, sent_4900):
    print(review)
    print("="*50)

In [None]:
import re    #Tutorial about Python regular expressions: https://pymotw.com/2/re/
# remove urls from text python : *https://stackoverflow.com/a/40823105/4084039*
# The same pattern is used for all four samples. `http\S+` matches both
# http:// and https:// links; three of the four calls previously used
# `https\S+`, which left plain http:// URLs in the text.
url_pattern = re.compile(r'http\S+')
sent_0 = url_pattern.sub('', sent_0)
sent_1000 = url_pattern.sub('', sent_1000)
sent_1500 = url_pattern.sub('', sent_1500)
sent_4900 = url_pattern.sub('', sent_4900)

print(sent_0)

In [None]:
#SENTENCES Containing HTML tags
# Scan the reviews in order and print the position and text of the first one
# that contains something looking like an HTML tag, then stop.
import re
html_tag = re.compile('<.*?>')
for i, sentence in enumerate(final_data['Text'].values):
    if html_tag.search(sentence):
        print(i)
        print(sentence)
        break

In [None]:
# Hand-written stop-word set used by the preprocessing loop instead of
# stopwords.words('english'):
#stop=set(stopwords.words('english')) #set of stopwords
# The negations 'no', 'nor' and 'not' are deliberately absent from this set so
# that they are kept in the text and preserve meaning in bigrams and trigrams.
# 'br' is included in addition to ordinary English function words
# (presumably to drop remnants of <br> tags — TODO confirm).
# NOTE(review): this assignment rebinds the name `stopwords`, which the import
# cell also binds to nltk.corpus.stopwords; after this cell runs the name
# refers to this set.
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',  'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',  'weren', \
            'won', "won't", 'wouldn'])
stemming = nltk.stem.SnowballStemmer('english') # initialising the English Snowball stemmer

def cleanhtml(sentence):
    """Return `sentence` with every HTML-like tag (`<...>`, shortest match) replaced by one space."""
    return re.sub('<.*?>', ' ', sentence)
def cleanpunc(sentence):
    """Strip punctuation from a piece of text in two passes.

    First pass deletes `?`, `!`, `'`, `"`, `#` and `|` outright; the second pass
    turns `.`, `,`, `(`, `)`, `|` and `/` into a single space so that tokens
    joined by them (e.g. "abc.def") become separable.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)

# Sanity check: show the stop-word set in use and what the stemmer returns for
# a sample word.
print('stop words are :',stopwords)
print("*********************")
print('base word for tasty is :',stemming.stem('tasty'))

In [None]:
# this code takes a while to run as it needs to run on 500k sentences.
# For every review: strip HTML tags, split into tokens, strip punctuation,
# keep purely alphabetic tokens longer than 2 characters that are not stop
# words, lowercase + stem them, and join the result into one cleaned string.
# NOTE(review): the guard checks for a file named 'finalsqlite', while the
# cleaned table is written to 'final.sqlite'; there is also no branch that
# loads cached results, so later cells need this block to run. Left unchanged
# — confirm the intended caching behaviour.
if not os.path.isfile('finalsqlite'):
    final_string=[]
    all_positive_words=[] # store words from +ve review
    all_negative_words=[] # store words from -ve review
    # Fetch the label array once instead of re-reading the column for every word.
    review_scores = final_data['Score'].values
    for sentence, score in zip(tqdm(final_data['Text'].values), review_scores):
        filter_sentences =[]
        sent = cleanhtml(sentence) #remove HTML tags
        for words in sent.split():
            # cleanpunc() can turn one token into several (e.g. "abc.def" -> "abc def"),
            # so its result is split again to get the individual words.
            for cleaned_words in cleanpunc(words).split():
                if not (cleaned_words.isalpha() and len(cleaned_words)>2):
                    continue
                lowered = cleaned_words.lower()
                if lowered in stopwords:
                    continue
                # Stored as UTF-8 bytes; the cleaned strings are decoded when
                # the Cleaned_text column is created.
                s=stemming.stem(lowered).encode('utf8')
                filter_sentences.append(s)
                if score==1:
                    all_positive_words.append(s)  #list of all words used to describe positive reviews
                if score==0:
                    all_negative_words.append(s)  #list of all words used to describe negative reviews
        final_string.append(b" ".join(filter_sentences)) #final string of cleaned words

In [None]:
# Add the pre-processed review text as a new column. The cleaned reviews were
# collected as UTF-8 byte strings, so they are decoded to str on the way in.
cleaned_bytes = pd.Series(final_string, index=final_data.index)
final_data['Cleaned_text'] = cleaned_bytes.str.decode('utf-8')

In [None]:
#Store the final table into an SQLITE table for future
# The unused `c=connection.cursor` line (which only referenced the method
# without calling it) has been dropped, and the connection is now closed even
# if the write fails.
connection=sqlite3.connect('final.sqlite')
try:
    connection.text_factory=str
    # NOTE(review): index_label=True is passed through unchanged; pandas documents
    # this argument as a column label (str or sequence) — confirm the resulting
    # index column name is what is wanted.
    final_data.to_sql('Reviews',connection,schema=None,if_exists='replace',index=True,index_label=True,chunksize=None,dtype=None)
finally:
    connection.close()

# Sorting dataset based on 'Time' feature

In [None]:
# Convert the 'Time' column to pandas datetimes, interpreting the stored values
# as a number of seconds (unit='s').
# NOTE(review): the column is overwritten in place, so re-running this cell
# applies the conversion to already-converted values — confirm that is safe.
final_data['Time']=pd.to_datetime(final_data['Time'],unit='s')
final_data.head()

In [None]:
# Order the reviews chronologically (ascending 'Time').
final_data=final_data.sort_values(by='Time')

1. Apply Knn(brute force version) on these feature sets
 * Review text, preprocessed one converted into vectors using (BOW)
 * Review text, preprocessed one converted into vectors using (TFIDF)
 * Review text, preprocessed one converted into vectors using (AVG W2v)

2. Apply Knn(kd tree version) on these feature sets:    
The sklearn implementation of kd-tree accepts only dense matrices, so you need to convert the sparse matrices produced by CountVectorizer/TfidfVectorizer into dense matrices. You can convert a sparse matrix to a dense one using the `.toarray()` method.
    * Review text, preprocessed one converted into vectors using (BOW) but with restriction on maximum features generated.
            count_vect = CountVectorizer(min_df=10, max_features=500) 
            count_vect.fit(preprocessed_reviews)
            
    * Review text, preprocessed one converted into vectors using (TFIDF) but with restriction on maximum features generated.
                tf_idf_vect = TfidfVectorizer(min_df=10, max_features=500)
                tf_idf_vect.fit(preprocessed_reviews)
            
    * Review text, preprocessed one converted into vectors using (AVG W2v)
    * Review text, preprocessed one converted into vectors using (TFIDF W2v)

3. The hyper paramter tuning(find best K)
    * Find the best hyper parameter which will give the maximum AUC value
    * Find the best hyper paramter using k-fold cross validation or simple cross validation data
    * Use gridsearch cv or randomsearch cv or you can also write your own for loops to do this task of hyperparameter tuning

4. Representation of results
    * You need to plot the performance of model both on train data and cross validation data for each hyper parameter, like shown in the figure
    * Once after you found the best hyper parameter, you need to train your model with it, and find the AUC on test data and plot the ROC curve on both train and test.
    * Along with plotting ROC curve, you need to print the confusion matrix with predicted and original labels of test data points
5. Conclusion
    * summarize the results at the end of the notebook, summarize it in the table format.


1. There will be an issue of data-leakage if you vectorize the entire data and then split it into train/cv/test.
2. To avoid the issue of data leakage, make sure to split your data first and then vectorize it.
3. While vectorizing your data, apply the method fit_transform() on you train data, and apply the method transform() on cv/test data.

In [None]:
# Number of negative (0) and positive (1) reviews in the cleaned dataset.
final_data['Score'].value_counts()

In [None]:
# Build a roughly class-balanced working set: 60,000 positive and 57,000
# negative reviews drawn at random (the two sizes are close but not equal).
# A fixed random_state makes the draw — and everything computed from it —
# reproducible across kernel restarts; the value 0 matches the seed used for
# the train/test split in this notebook.
data_pos = final_data[final_data["Score"] == 1].sample(n = 60000, random_state = 0)
data_neg = final_data[final_data['Score'] == 0].sample(n = 57000, random_state = 0)
final=pd.concat([data_pos,data_neg])
final.shape

In [None]:
# Features are the cleaned review texts; labels are the binary scores.
X, y = final['Cleaned_text'], final['Score']
print("Shape of X",X.shape)
print("Shape of y",y.shape)

In [None]:
from sklearn import preprocessing

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [None]:
from sklearn.model_selection import train_test_split
# Splitting the data in train ,test and Cv dataset
# Step 1: hold out 30% of all reviews as the test set.
# Step 2: split the remaining 70% again 70/30 into train and cross-validation.
# The intermediate split is stored under its own names (X_rest / y_rest)
# instead of overwriting `y`: overwriting the full label vector made the cell
# fail on a second run, because X and y no longer had the same length.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=.30, random_state=0)
X_train, X_cv, y_train, y_cv = train_test_split(X_rest, y_rest, test_size=.30, random_state=0)
print('Shape of X_train is :',X_train.shape)
print('Shape of y_train is :',y_train.shape)
print("****"*6)
print('Shape of X_Cv is :',X_cv.shape)
print('Shape of y_cv is :',y_cv.shape)
print("****"*6)
print('Shape of X_test is :',X_test.shape)
print('Shape of y_test is :',y_test.shape)


# Applying KNN brute force on BOW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Converting text into vector Using BOW
# The vocabulary is learned from the training split only; the fitted
# vectorizer is then applied unchanged to the CV and test splits.
coun_vect=CountVectorizer()
coun_vect.fit(X_train) # fit has to happen only on train data

# we use the fitted CountVectorizer to convert the text into vectors
X_train_bow=coun_vect.transform(X_train)
X_cv_bow=coun_vect.transform(X_cv)
X_test_bow=coun_vect.transform(X_test)

# --- Small illustration of unigram vs. bigram bag-of-words -------------------
# The demo that used to follow referenced a `corpus` that is never defined in
# this notebook and ended with a stray '*', a SyntaxError that stopped the
# whole cell (including the vectorisation above) from running. It is kept
# here in runnable form on a three-sentence toy corpus chosen for the demo.
demo_corpus = [
    'this food is good',
    'this food is not good',
    'good food at a good price',
]

vectorizer = CountVectorizer()
demo_counts = vectorizer.fit_transform(demo_corpus).toarray()
# vocabulary_ maps each term to its column index; sorting the terms by that
# index yields the column names without relying on a version-specific
# get_feature_names*() method.
demo_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
unigram_demo = pd.DataFrame(data = demo_counts,
                            columns = demo_terms,
                            index = ['v1', 'v2', 'v3']).T.to_dict()

vectorizer = CountVectorizer(ngram_range=(2,2))
demo_counts = vectorizer.fit_transform(demo_corpus).toarray()
demo_terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
bigram_demo = pd.DataFrame(data = demo_counts,
                           columns = demo_terms,
                           index = ['v1', 'v2', 'v3']).T.to_dict()

# Show both tables as the cell output.
unigram_demo, bigram_demo

In [None]:
print("After vectorizations")
# Report the (rows, columns) of each BOW matrix next to its label vector.
for features, labels in ((X_train_bow, y_train), (X_cv_bow, y_cv), (X_test_bow, y_test)):
    print(features.shape, labels.shape)
print("="*100)



print("YOU SHOULD NOT DO SOMETHING LIKE THIS")
# Anti-pattern, kept commented out on purpose: calling fit_transform() on
# every split builds a separate vocabulary for each one.
#vectorizer = CountVectorizer()
#x_train_bow = vectorizer.fit_transform(X_train)
#x_cv_bow = vectorizer.fit_transform(X_cv)
#x_test_bow = vectorizer.fit_transform(X_test)

#print(x_train_bow.shape, y_train.shape)
#print(x_cv_bow.shape, y_cv.shape)
#print(x_test_bow.shape, y_test.shape)

print("NOTE: THE NUMBER OF COLUMNS IN EACH OF THE VECTOR WONT BE SAME")

<pre>
<h2> <font color='red'>YOU SHOULD NOT DO LIKE THIS </font></h2>
1.  <font color='red'>THE VOCABULARY SHOULD BUILT ONLY WITH THE WORDS OF TRAIN DATA</font>
    vectorizer = CountVectorizer()
    x_train_bow = vectorizer.fit_transform(X_train)
    x_cv_bow = vectorizer.fit_transform(X_cv)
    x_test_bow = vectorizer.fit_transform(X_test)
    
2.  <font color='red'>DATA LEAKAGE PROBLEM: IF WE DO LIKE THIS WE ARE LOOKING AT THE TEST DATA BEFORE MODELING</font>
    vectorizer = CountVectorizer()
    X_bow = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_bow, Y, test_size=0.33)
   
3. <font color='red'>YOU SHOULD PASS THE PROBABILITY SCORES NOT THE PREDICTED VALUES</font>
    y_pred =  neigh.predict(X)
    roc_auc_score(y_true,y_pred)


</pre>

# Applying KNN

## Hyper Parameter tuning 


In [None]:
### Hyper Parameter tuning Method 1: Simple 'for' loop
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score 
import matplotlib.pyplot as plt
# Candidate neighbourhood sizes: odd values 1, 3, ..., 29.
k = list(range(1, 30, 2))
train_auc, cv_auc = [], []

# For each k, fit brute-force KNN on the BOW training matrix and record the
# AUC on the training and cross-validation splits. AUC is computed from the
# predicted probability of the positive class, not from hard labels.
for i in tqdm(k):
    knn = KNeighborsClassifier(n_neighbors=i, weights='uniform', algorithm='brute')
    knn.fit(X_train_bow, y_train)

    y_train_pred = knn.predict_proba(X_train_bow)[:, 1]
    y_cv_pred = knn.predict_proba(X_cv_bow)[:, 1]

    train_auc.append(roc_auc_score(y_train, y_train_pred))
    cv_auc.append(roc_auc_score(y_cv, y_cv_pred))

# Train vs. CV AUC as a function of k.
fig, ax = plt.subplots()
ax.plot(k, train_auc, label='Train_auc')
ax.plot(k, cv_auc, label='Cv_auc')
ax.legend()
ax.set_xlabel("K: hyperparameter")
ax.set_ylabel("AUC")
ax.set_title("ERROR PLOTS")
plt.show()


In [None]:
# Hyperparameter tuning using 10-fold CV
# For every candidate k, the mean 10-fold cross-validated accuracy on the BOW
# training matrix is collected.
k = list(range(1, 30, 2))
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='brute'),
        X_train_bow, y_train, cv=10, scoring='accuracy', n_jobs=-1,
    ).mean()
    for n_neighbors in tqdm(k)
]

# `MSE` holds the misclassification error (1 - accuracy) for each k; the
# optimal k is the first one with the smallest error.
MSE = [1 - accuracy for accuracy in cv_scores]
optimal_k = k[int(np.argmin(MSE))]

separator = "_" * 101
print(separator)
print("Optimal number of neighbors: ", optimal_k)
print(separator)
print("Missclassification error for each k values: ", np.round(MSE, 3))
print(separator)

# plot error vs k
fig, ax = plt.subplots()
ax.plot(k, MSE)
ax.set_title("Number of neighbors and error")
ax.set_xlabel("Number of neighbors")
ax.set_ylabel("Missclassification error")
ax.grid()
plt.show()

In [None]:
# ============================== KNN with k = optimal_k ===============================================
# Fit brute-force KNN with the selected k on the BOW training matrix and plot
# the ROC curves (with AUC) for the test and training splits.
knn_optimal_model=KNeighborsClassifier(n_neighbors=optimal_k,algorithm='brute')
knn_optimal_model.fit(X_train_bow,y_train)

# Probability of the positive class on the test split.
y_test_predict=knn_optimal_model.predict_proba(X_test_bow)[:,1]
fpr1,tpr1,threshold1=metrics.roc_curve(y_test,y_test_predict)

# Probability of the positive class on the training split, from the SAME
# fitted model. The train curve and AUC previously used `y_train_pred`, a
# leftover from the tuning loop (predictions of the last k tried), so they
# did not describe this model.
y_tr_pred=knn_optimal_model.predict_proba(X_train_bow)[:,1]
fpr2,tpr2,threshold2=metrics.roc_curve(y_train,y_tr_pred)

fig=plt.figure()
ax=plt.subplot(111)
ax.plot(fpr1, tpr1, label='Test ROC ,auc='+str(roc_auc_score(y_test,y_test_predict)))
ax.plot(fpr2, tpr2, label='Train ROC ,auc='+str(roc_auc_score(y_train,y_tr_pred)))
plt.title('ROC')
plt.xlabel('FPR')
plt.ylabel('TPR')
ax.legend()
plt.show()


In [None]:
#confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Evaluate the classifier in the same configuration that was tuned and whose
# ROC curve is reported: brute-force KNN with the default distance metric.
# The previous version passed metric='cosine' here, so the confusion matrix
# described a different model from the one for which optimal_k was selected.
knn_model=KNeighborsClassifier(n_neighbors=optimal_k,weights='uniform',algorithm='brute')
knn_model.fit(X_train_bow,y_train)
y_test_pred=knn_model.predict(X_test_bow)

# Rows are true labels, columns are predicted labels (0 = negative, 1 = positive).
conf_mat=confusion_matrix(y_test,y_test_pred)
class_label=['negative','positive']
df=pd.DataFrame(conf_mat,index=class_label,columns=class_label)
sns.heatmap(df,annot=True,fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Applying KNN brute force on TF-idf vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Converting text into vectors using TfidfVectorizer (unigrams and bigrams).
# The vocabulary and IDF weights are learned from the training split only.
tfidf_count_vect = TfidfVectorizer(ngram_range=(1,2)).fit(X_train)

# The fitted TF-IDF vectorizer is applied unchanged to all three splits.
X_train_tfidf = tfidf_count_vect.transform(X_train)
X_cv_tfidf = tfidf_count_vect.transform(X_cv)
X_test_tfidf = tfidf_count_vect.transform(X_test)

print("After vectorizations")
for features, labels in ((X_train_tfidf, y_train), (X_cv_tfidf, y_cv), (X_test_tfidf, y_test)):
    print(features.shape, labels.shape)
print("="*100)

In [None]:
### Hyper Parameter tuning Method 1: Simple 'for' loop
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
# Candidate neighbourhood sizes: odd values 1, 3, ..., 29.
k = list(range(1, 30, 2))
train_auc, cv_auc = [], []

# For each k, fit brute-force KNN on the TF-IDF training matrix and record the
# AUC (from positive-class probabilities) on the CV and training splits.
for i in tqdm(k):
    knn_tfidf = KNeighborsClassifier(n_neighbors=i, algorithm='brute', weights='uniform')
    knn_tfidf.fit(X_train_tfidf, y_train)

    y_cv_pred = knn_tfidf.predict_proba(X_cv_tfidf)[:, 1]
    cv_auc.append(roc_auc_score(y_cv, y_cv_pred))

    y_tr_pred = knn_tfidf.predict_proba(X_train_tfidf)[:, 1]
    train_auc.append(roc_auc_score(y_train, y_tr_pred))

# Train vs. CV AUC as a function of k.
fig, ax = plt.subplots()
ax.plot(k, train_auc, label='Train_auc')
ax.plot(k, cv_auc, label='cv_auc')
ax.legend()
ax.set_xlabel("K: hyperparameter")
ax.set_ylabel("AUC")
ax.set_title("ERROR PLOTS")
plt.show()

    

In [None]:
# Hyperparameter tuning using 10-fold CV
# For every candidate k, the mean 10-fold cross-validated accuracy on the
# TF-IDF training matrix is collected.
k = list(range(1, 30, 2))
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='brute'),
        X_train_tfidf, y_train, cv=10, scoring='accuracy', n_jobs=-1,
    ).mean()
    for n_neighbors in tqdm(k)
]

# `MSE` holds the misclassification error (1 - accuracy) for each k; the
# optimal k is the first one with the smallest error.
MSE = [1 - accuracy for accuracy in cv_scores]
optimal_k = k[int(np.argmin(MSE))]

separator = "_" * 101
print(separator)
print("Optimal number of neighbors: ", optimal_k)
print(separator)
print("Missclassification error for each k values: ", np.round(MSE, 3))
print(separator)

# plot error vs k
fig, ax = plt.subplots()
ax.plot(k, MSE)
ax.set_title("Number of neighbors and error")
ax.set_xlabel("Number of neighbors")
ax.set_ylabel("Missclassification error")
ax.grid()
plt.show()

In [None]:
# ============================== KNN with k = optimal_k ===============================================
# Fit brute-force KNN with the selected k on the TF-IDF training matrix and
# plot the ROC curves (with AUC) for the test and training splits.
knn_tfidf_model=KNeighborsClassifier(n_neighbors=optimal_k,algorithm='brute')
knn_tfidf_model.fit(X_train_tfidf,y_train)

# Probability of the positive class on the test split.
y_test_predict=knn_tfidf_model.predict_proba(X_test_tfidf)[:,1]
fpr1,tpr1,threshold1=metrics.roc_curve(y_test,y_test_predict)

# Probability of the positive class on the training split, from the SAME
# fitted model. The train curve and AUC previously used `y_train_pred`, which
# this section never assigns — it is a leftover from the BOW tuning loop — so
# the plotted train ROC did not belong to the TF-IDF model.
y_tr_pred=knn_tfidf_model.predict_proba(X_train_tfidf)[:,1]
fpr2,tpr2,threshold2=metrics.roc_curve(y_train,y_tr_pred)

fig=plt.figure()
ax=plt.subplot(111)
ax.plot(fpr1, tpr1, label='Test ROC ,auc='+str(roc_auc_score(y_test,y_test_predict)))
ax.plot(fpr2, tpr2, label='Train ROC ,auc='+str(roc_auc_score(y_train,y_tr_pred)))
plt.title('ROC')
plt.xlabel('FPR')
plt.ylabel('TPR')
ax.legend()
plt.show()

In [None]:
#confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Evaluate the classifier in the same configuration that was tuned and whose
# ROC curve is reported: brute-force KNN with the default distance metric.
# The previous version passed metric='cosine' here, so the confusion matrix
# described a different model from the one for which optimal_k was selected.
knn_tfidf_model=KNeighborsClassifier(n_neighbors=optimal_k,weights='uniform',algorithm='brute')
knn_tfidf_model.fit(X_train_tfidf,y_train)
y_test_pred=knn_tfidf_model.predict(X_test_tfidf)

# Rows are true labels, columns are predicted labels (0 = negative, 1 = positive).
conf_mat=confusion_matrix(y_test,y_test_pred)
class_label=['negative','positive']
df=pd.DataFrame(conf_mat,index=class_label,columns=class_label)
sns.heatmap(df,annot=True,fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Applying KNN brute force on Avg_w2v 

In [None]:
# average Wordvec
# Computer average Word2vec for each review
import gensim
from gensim.models import Word2Vec  # it create dense vector
from gensim.models import KeyedVectors


#word2vec for train
# Tokenise every training review on whitespace.
list_sentences_train=[sentence.split() for sentence in X_train]

# Train 50-dimensional Word2Vec embeddings on the training reviews only.
w2v_model=gensim.models.Word2Vec(list_sentences_train,min_count=5,size=50, workers=4)
w2v_words=list(w2v_model.wv.vocab)
# Set copy of the vocabulary: membership tests inside the loop are then O(1)
# instead of a linear scan of the list for every word of every review.
w2v_vocab=set(w2v_words)

# Represent each review by the mean of the vectors of its in-vocabulary words;
# a review with no known word keeps the all-zero vector.
sen_vect_train=[]
for sent in tqdm(list_sentences_train):
    sent_vec=np.zeros(50)
    cnt_words=0
    for word  in sent:
        if word in w2v_vocab:
            sent_vec+=w2v_model.wv[word]
            cnt_words+=1
    if cnt_words!=0:
        sent_vec/=cnt_words
    sen_vect_train.append(sent_vec)
print(len(sen_vect_train))

print(len(sen_vect_train[0]))
                


In [None]:
#word2vec for Cv and test data
# The CV and test reviews are embedded with the Word2Vec model that was
# trained on the TRAINING reviews (`w2v_model` from the previous cell).
# Previously a fresh Word2Vec model was trained on the CV data and another on
# the test data; independently trained models produce unrelated vector
# spaces, so distances between train and CV/test vectors were meaningless
# (and the test data was used for fitting). Both splits are handled in this
# one cell so that they are guaranteed to use the same train-fitted model.

def _average_word2vec(tokenised_reviews, model, dim=50):
    """Return one `dim`-sized vector per review: the mean of the model's
    vectors for the review's in-vocabulary words (all zeros if none is known)."""
    known_words = set(model.wv.vocab)  # O(1) membership tests
    review_vectors = []
    for sent in tqdm(tokenised_reviews):
        sent_vec = np.zeros(dim)
        cnt_words = 0
        for word in sent:
            if word in known_words:
                sent_vec += model.wv[word]
                cnt_words += 1
        if cnt_words != 0:
            sent_vec /= cnt_words
        review_vectors.append(sent_vec)
    return review_vectors

# Cross-validation split
list_sentences_cv = [sentence.split() for sentence in X_cv]
sen_vect_cv = _average_word2vec(list_sentences_cv, w2v_model)
print(len(sen_vect_cv))
print(len(sen_vect_cv[0]))

# Test split
list_sentences_test = [sentence.split() for sentence in X_test]
sen_vect_test = _average_word2vec(list_sentences_test, w2v_model)
print(len(sen_vect_test))
print(len(sen_vect_test[0]))

In [None]:
# apply knn on avg_w2vec
# NOTE(review): X_cv and X_test are rebound from the review texts to their
# averaged Word2Vec vectors here; cells further down read these names.
X_tr, X_cv, X_test = sen_vect_train, sen_vect_cv, sen_vect_test

# Candidate neighbourhood sizes: odd values 1, 3, ..., 29.
k = list(range(1, 30, 2))
auc_cv, auc_train = [], []

# For each k, fit brute-force KNN on the averaged-W2V training vectors and
# record the AUC (from positive-class probabilities) on CV and train.
for i in tqdm(k):
    knn_avg_w2vec = KNeighborsClassifier(n_neighbors=i, weights='uniform', algorithm='brute')
    knn_avg_w2vec.fit(X_tr, y_train)

    y_cv_pred = knn_avg_w2vec.predict_proba(X_cv)[:, 1]
    auc_cv.append(roc_auc_score(y_cv, y_cv_pred))

    y_tr_pred = knn_avg_w2vec.predict_proba(X_tr)[:, 1]
    auc_train.append(roc_auc_score(y_train, y_tr_pred))

fig, ax = plt.subplots()
ax.plot(k, auc_train, label='AUC train')
ax.plot(k, auc_cv, label='AUC CV')
ax.set_title('AUC vs K')
ax.set_xlabel('K')
ax.set_ylabel('AUC')
ax.legend()
plt.show()

In [None]:
# Hyperparameter tuning using 10-fold CV
# For every candidate k, the mean 10-fold cross-validated accuracy on the
# averaged-W2V training vectors is collected.
k = list(range(1, 30, 2))
cv_scores = []
for n_neighbors in tqdm(k):
    knn_avg_w2vec = KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform', algorithm='brute')
    fold_accuracy = cross_val_score(knn_avg_w2vec, X_tr, y_train, cv=10, scoring='accuracy', n_jobs=-1)
    cv_scores.append(fold_accuracy.mean())

# `MSE` holds the misclassification error (1 - accuracy) for each k; the
# optimal k is the first one with the smallest error.
MSE = [1 - accuracy for accuracy in cv_scores]
optimal_k = k[int(np.argmin(MSE))]

separator = "_" * 101
print(separator)
print("Optimal number of neighbors: ", optimal_k)
print(separator)
print("Missclassification error for each k values: ", np.round(MSE, 3))
print(separator)

# plot error vs k
fig, ax = plt.subplots()
ax.plot(k, MSE)
ax.set_title("Number of neighbors and error")
ax.set_xlabel("Number of neighbors")
ax.set_ylabel("Missclassification error")
ax.grid()
plt.show()

In [None]:
# ============================== KNN with k = optimal_k ===============================================
# Fit brute-force KNN with the selected k on the averaged-W2V training vectors
# and plot the ROC curves (with AUC) for the test and training splits.
knn_optimal_model = KNeighborsClassifier(n_neighbors=optimal_k, algorithm='brute')
knn_optimal_model.fit(X_tr, y_train)

# Positive-class probabilities and ROC points for the test split ...
y_test_predict = knn_optimal_model.predict_proba(X_test)[:, 1]
fpr1, tpr1, threshold1 = metrics.roc_curve(y_test, y_test_predict)
test_auc = roc_auc_score(y_test, y_test_predict)

# ... and for the training split, from the same fitted model.
y_tr_pred = knn_optimal_model.predict_proba(X_tr)[:, 1]
fpr2, tpr2, threshold2 = metrics.roc_curve(y_train, y_tr_pred)
train_auc_value = roc_auc_score(y_train, y_tr_pred)

fig, ax = plt.subplots()
ax.plot(fpr1, tpr1, label='Test ROC ,auc='+str(test_auc))
ax.plot(fpr2, tpr2, label='Train ROC ,auc='+str(train_auc_value))
ax.set_title('ROC')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.legend()
plt.show()

In [None]:
#confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Evaluate the classifier in the same configuration that was tuned and whose
# ROC curve is reported: brute-force KNN with the default distance metric.
# The previous version passed metric='cosine' here, so the confusion matrix
# described a different model from the one for which optimal_k was selected.
knn_optimal_model=KNeighborsClassifier(n_neighbors=optimal_k,weights='uniform',algorithm='brute')
knn_optimal_model.fit(X_tr,y_train)
y_test_pred=knn_optimal_model.predict(X_test)

# Rows are true labels, columns are predicted labels (0 = negative, 1 = positive).
conf_mat=confusion_matrix(y_test,y_test_pred)
class_label=['negative','positive']
df=pd.DataFrame(conf_mat,index=class_label,columns=class_label)
sns.heatmap(df,annot=True,fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Applying KNN brute force on tfidf_w2v

In [None]:
# Rebuild the text features / labels from the sampled frame and repeat the
# train / CV / test split. The same random_state values are used as in the
# first split.
X, y = final['Cleaned_text'], final['Score']
print("Shape of X",X.shape)
print("Shape of y",y.shape)
from sklearn.model_selection import train_test_split
# Splitting the data in train ,test and Cv dataset:
# 30% is held out as test, then the remainder is split 70/30 into train and CV.
x, X_test, y, y_test = train_test_split(X, y, test_size=.30, random_state=0)
X_train, X_cv, y_train, y_cv = train_test_split(x, y, test_size=.30, random_state=0)

divider = "****"*6
print('Shape of X_train is :',X_train.shape)
print('Shape of y_train is :',y_train.shape)
print(divider)
print('Shape of X_Cv is :',X_cv.shape)
print('Shape of y_cv is :',y_cv.shape)
print(divider)
print('Shape of X_test is :',X_test.shape)
print('Shape of y_test is :',y_test.shape)

In [None]:
import gensim

# Whitespace-tokenise every training document.
list_of_sentences_tr = [sentences.split() for sentences in tqdm(X_train)]

# Word2Vec is trained on the training sentences only (50-dimensional vectors).
# NOTE(review): `size=` and `wv.vocab` are the pre-4.0 gensim spellings
# (4.x uses `vector_size=` and `wv.key_to_index`).
w2v_model = gensim.models.Word2Vec(list_of_sentences_tr, size=50, min_count=5, workers=4)
w2v_words = list(w2v_model.wv.vocab)

# TF-IDF is likewise fitted on the training text only.
tf_idf_vect = TfidfVectorizer(max_features=500, min_df=10, ngram_range=(1, 2))
tf_idf_matrix = tf_idf_vect.fit_transform(X_train)

# Map each TF-IDF feature name to its IDF weight.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); kept as-is for the installed version.
tfidf_feat = tf_idf_vect.get_feature_names()
dictionary = dict(zip(tfidf_feat, tf_idf_vect.idf_))


In [None]:
# TF-IDF weighted Word2Vec vectors for the TRAIN sentences.
#
# Each sentence becomes sum(w2v(word) * tfidf(word)) / sum(tfidf(word)) over the
# tokens that have both a Word2Vec vector and an IDF weight. Sentences with no
# such token keep an all-zero vector.
from collections import Counter

# Build the usable-word set once: membership tests on the original lists
# (`w2v_words`, `tfidf_feat`) are linear scans repeated for every token.
# `dictionary` has exactly the TF-IDF feature names as keys.
usable_words = set(w2v_words).intersection(dictionary)
# Take the vector width from the model instead of hard-coding it.
vector_dim = w2v_model.wv.vector_size

tf_idf_vect_tr = []
for sentence in tqdm(list_of_sentences_tr):
    sentence_vect = np.zeros(vector_dim)
    weight_sum = 0
    # Count tokens once per sentence instead of calling sentence.count(word) per token.
    word_counts = Counter(sentence)
    sentence_len = len(sentence)
    # Iterate tokens in their original order (duplicates included) so the
    # accumulated values are identical to the previous implementation.
    for word in sentence:
        if word in usable_words:
            vec = w2v_model.wv[word]
            # idf * term frequency within this sentence
            tf_idf = dictionary[word] * (word_counts[word] / sentence_len)
            sentence_vect += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sentence_vect /= weight_sum
    tf_idf_vect_tr.append(sentence_vect)

# Number of sentences processed (name kept for backward compatibility).
row = len(tf_idf_vect_tr)

In [None]:
# TF-IDF weighted Word2Vec vectors for the TEST sentences.
#
# Uses the Word2Vec model and IDF weights fitted on the training data. Each
# sentence becomes sum(w2v(word) * tfidf(word)) / sum(tfidf(word)) over the
# tokens that have both a Word2Vec vector and an IDF weight; sentences with no
# such token keep an all-zero vector.
from collections import Counter

# Whitespace-tokenise the test documents.
list_of_sentences_test = [sentence.split() for sentence in X_test]

# Build the usable-word set once: membership tests on the original lists
# (`w2v_words`, `tfidf_feat`) are linear scans repeated for every token.
# `dictionary` has exactly the TF-IDF feature names as keys.
usable_words = set(w2v_words).intersection(dictionary)
# Take the vector width from the model instead of hard-coding it.
vector_dim = w2v_model.wv.vector_size

tf_idf_vect_test = []
for sentence in tqdm(list_of_sentences_test):
    sentence_vect = np.zeros(vector_dim)
    weight_sum = 0
    # Count tokens once per sentence instead of calling sentence.count(word) per token.
    word_counts = Counter(sentence)
    sentence_len = len(sentence)
    # Iterate tokens in their original order (duplicates included) so the
    # accumulated values are identical to the previous implementation.
    for word in sentence:
        if word in usable_words:
            vec = w2v_model.wv[word]
            # idf * term frequency within this sentence
            tf_idf = dictionary[word] * (word_counts[word] / sentence_len)
            sentence_vect += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sentence_vect /= weight_sum
    tf_idf_vect_test.append(sentence_vect)

# Number of sentences processed (name kept for backward compatibility).
row = len(tf_idf_vect_test)

In [None]:
# TF-IDF weighted Word2Vec vectors for the CV sentences.
#
# Uses the Word2Vec model and IDF weights fitted on the training data. Each
# sentence becomes sum(w2v(word) * tfidf(word)) / sum(tfidf(word)) over the
# tokens that have both a Word2Vec vector and an IDF weight; sentences with no
# such token keep an all-zero vector.
from collections import Counter

# Whitespace-tokenise the CV documents.
list_of_sentences_cv = [sentence.split() for sentence in X_cv]

# Build the usable-word set once: membership tests on the original lists
# (`w2v_words`, `tfidf_feat`) are linear scans repeated for every token.
# `dictionary` has exactly the TF-IDF feature names as keys.
usable_words = set(w2v_words).intersection(dictionary)
# Take the vector width from the model instead of hard-coding it.
vector_dim = w2v_model.wv.vector_size

tf_idf_vect_cv = []
for sentence in tqdm(list_of_sentences_cv):
    sentence_vect = np.zeros(vector_dim)
    weight_sum = 0
    # Count tokens once per sentence instead of calling sentence.count(word) per token.
    word_counts = Counter(sentence)
    sentence_len = len(sentence)
    # Iterate tokens in their original order (duplicates included) so the
    # accumulated values are identical to the previous implementation.
    for word in sentence:
        if word in usable_words:
            vec = w2v_model.wv[word]
            # idf * term frequency within this sentence
            tf_idf = dictionary[word] * (word_counts[word] / sentence_len)
            sentence_vect += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sentence_vect /= weight_sum
    tf_idf_vect_cv.append(sentence_vect)

# Number of sentences processed (name kept for backward compatibility).
row = len(tf_idf_vect_cv)

In [None]:
# KNN on the TF-IDF weighted Word2Vec vectors: train and CV AUC for each odd k in 1..29.
x_train = tf_idf_vect_tr
x_cv = tf_idf_vect_cv
x_test = tf_idf_vect_test

k = [*range(1, 30, 2)]
auc_train, auc_cv = [], []
for i in tqdm(k):
    knn = KNeighborsClassifier(algorithm='brute', weights='uniform', n_neighbors=i)
    knn.fit(x_train, y_train)

    # Column 1 of predict_proba is used as the ranking score for the AUC.
    y_cv_pred = knn.predict_proba(x_cv)[:, 1]
    y_tr_pred = knn.predict_proba(x_train)[:, 1]

    auc_cv.append(roc_auc_score(y_cv, y_cv_pred))
    auc_train.append(roc_auc_score(y_train, y_tr_pred))

# Train vs. CV AUC across k.
fig = plt.figure()
ax = plt.subplot(111)
ax.plot(k, auc_train, label='AUC train')
ax.plot(k, auc_cv, label='AUC CV')
ax.legend()
plt.xlabel('K')
plt.ylabel('AUC')
plt.title('AUC vs K')
plt.show()

In [None]:
# Pick k by 10-fold cross-validated accuracy on the training vectors.
k = [*range(1, 50, 2)]

# Mean CV accuracy for each candidate k, in the same order as `k`.
cv_scores = []
for i in tqdm(k):
    knn = KNeighborsClassifier(weights='uniform', algorithm='brute', n_neighbors=i)
    scores = cross_val_score(knn, x_train, y_train, scoring='accuracy', cv=10)
    cv_scores.append(scores.mean())

# Misclassification rate = 1 - accuracy.
# NOTE: despite its name, MSE stores error rates, not mean squared errors.
MSE = list(map(lambda accuracy: 1 - accuracy, cv_scores))

# The first k that reaches the smallest error is taken as the optimum.
lowest_error = min(MSE)
optimal_k = k[MSE.index(lowest_error)]
print('\nThe optimal number of neighbors is %d.' % optimal_k)

# Error curve, with every point annotated as (k, rounded error).
rounded_errors = np.round(MSE, 3)
plt.plot(k, MSE)
for xy in zip(k, rounded_errors):
    plt.annotate('(%s, %s)' % xy, xy=xy, textcoords='data')
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()

print("the misclassification error for each k value is : ", rounded_errors)
print('*********************************************************************')
print("optimal k value is: ", optimal_k)

In [None]:
# ============================== KNN with k = optimal_k ===============================================
# Fit a brute-force KNN with the selected k, then compare train and test ROC curves.
x_test = tf_idf_vect_test
knn_optimal_model = KNeighborsClassifier(algorithm='brute', n_neighbors=optimal_k)
knn_optimal_model.fit(x_train, y_train)

# Column 1 of predict_proba is used as the ranking score for each split.
y_test_predict = knn_optimal_model.predict_proba(x_test)[:, 1]
y_tr_pred = knn_optimal_model.predict_proba(x_train)[:, 1]

# ROC points: suffix 1 = test split, suffix 2 = train split.
fpr1, tpr1, threshold1 = metrics.roc_curve(y_test, y_test_predict)
fpr2, tpr2, threshold2 = metrics.roc_curve(y_train, y_tr_pred)

test_auc = roc_auc_score(y_test, y_test_predict)
train_auc = roc_auc_score(y_train, y_tr_pred)

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(fpr1, tpr1, label='Test ROC ,auc=' + str(test_auc))
ax.plot(fpr2, tpr2, label='Train ROC ,auc=' + str(train_auc))
ax.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')
plt.show()

In [None]:
# Confusion matrix of the test-set predictions for the tuned KNN model.
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Fit with the same configuration that was used to tune k and to draw the ROC
# curve (brute force, uniform weights, default distance metric). The previous
# version passed metric='cosine' here only, so the matrix described a
# different model from the one that was tuned and evaluated.
knn_ = KNeighborsClassifier(n_neighbors=optimal_k, weights='uniform', algorithm='brute')
knn_.fit(x_train, y_train)
y_test_pred = knn_.predict(x_test)

# Rows of the matrix are the true labels, columns the predicted labels.
conf_mat = confusion_matrix(y_test, y_test_pred)
class_label = ['negative', 'positive']
df = pd.DataFrame(conf_mat, index=class_label, columns=class_label)

sns.heatmap(df, annot=True, fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Applying KNN (kd_tree)

In [None]:
# importing important library 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.metrics import roc_auc_score

In [None]:
# Input is the 'Cleaned_text' column, target is the 'Score' column.
X, y = final['Cleaned_text'], final['Score']
print("Shape of X", X.shape)
print("Shape of y", y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split into train / CV / test.
# NOTE: the first call rebinds `y` to the 70% remainder while `X` stays whole,
# so this cell cannot be re-run on its own; re-run the cell defining X and y first.
x, X_test, y, y_test = train_test_split(X, y, random_state=0, test_size=.30)
# 30% of the remainder becomes the CV set, the rest is train.
X_train, X_cv, y_train, y_cv = train_test_split(x, y, random_state=0, test_size=.30)

divider = "****" * 6
print('Shape of X_train is :', X_train.shape)
print('Shape of y_train is :', y_train.shape)
print(divider)
print('Shape of X_Cv is :', X_cv.shape)
print('Shape of y_cv is :', y_cv.shape)
print(divider)
print('Shape of X_test is :', X_test.shape)
print('Shape of y_test is :', y_test.shape)

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vocabulary learned from the training text only
# (at most 500 features, each present in at least 10 documents).
coun_vect = CountVectorizer(max_features=500, min_df=10).fit(X_train)

# Apply the fitted vectorizer to every split.
train_bow, cv_bow, test_bow = (
    coun_vect.transform(split) for split in (X_train, X_cv, X_test)
)

# scikit-learn's kd-tree works on dense input only, so convert the sparse
# matrices returned by the vectorizer with the .toarray() method.
X_train_bow, X_cv_bow, X_test_bow = (
    sparse_bow.toarray() for sparse_bow in (train_bow, cv_bow, test_bow)
)

print("After vectorizations")
print(X_train_bow.shape, y_train.shape)
print(X_cv_bow.shape, y_cv.shape)
print(X_test_bow.shape, y_test.shape)
print("=" * 100)


In [None]:
### Hyper Parameter tuning Method 1: Simple 'for' loop
# Fit a kd-tree KNN on the train split for each odd k in 1..29 and report
# its accuracy on the CV split.
for i in range(1, 30, 2):
    knn = KNeighborsClassifier(n_neighbors=i, weights='uniform', algorithm='kd_tree')

    # Fit on the train split only.
    knn.fit(X_train_bow, y_train)

    # Predict labels for the held-out CV split.
    pred = knn.predict(X_cv_bow)

    # CV accuracy in percent: y_cv holds the actual labels, pred the predicted ones.
    acc = accuracy_score(y_cv, pred, normalize=True) * float(100)
    # Two decimals: '%d' truncated the float (e.g. 84.9 -> 84), which hid the
    # differences between neighbouring k values.
    print('\n CV Accuracy for k = %d is %.2f %%' % (i, acc))

In [None]:
#

In [None]:
#

In [None]:
#

In [None]:
#