In [None]:

# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE(review): with `2>&1 > /dev/null` only stdout is discarded; stderr is
# duplicated onto the original stdout first and still reaches the cell output.
# Use `> /dev/null 2>&1` if both streams are meant to be silenced — confirm intent.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse



# Generate auth tokens for Colab
from google.colab import auth
auth.authenticate_user()


# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run gets no stdin (/dev/null); only its "URL" line is kept so
# the authorisation link is shown in the cell output.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it.
vcode = getpass.getpass()
# Second headless run receives the code on stdin.
# NOTE(review): vcode is interpolated into a shell command line unquoted.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}


# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive

print('Files in Drive:')
!ls drive/
# The triple-quoted block below is a bare string expression: nothing inside it
# runs, and as the last expression of the cell it is echoed as the cell result.
'''
# Create a file in Drive.
!echo "This newly created file will appear in your Drive file list." > drive/created.txt
'''

Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.
Files in Drive:
1APP presentation.ppt				 Google Buzz
Annual budget.ods				 H2.xlsx
blob.odt					 H2.xlsx.ods
Classroom					 Mukesh.pdf
Colab Notebooks					 Reviews.csv
Copy of Amazon Fine Food Reviews Analysis.ipynb  syntel data sheet.ods
created.txt					 To-do list.ods
database.sqlite					 Untitled form.zip

'\n# Create a file in Drive.\n!echo "This newly created file will appear in your Drive file list." > drive/created.txt\n'


# Amazon Fine food review Prediction

Data Source: https://www.kaggle.com/snap/amazon-fine-food-reviews

Data is available in both CSV and SQLite formats; we will be using the SQLite file.

SQLite is an embedded SQL database engine. Unlike most other SQL databases, SQLite does not have a separate server process. SQLite reads and writes directly to ordinary disk files. A complete SQL database with multiple tables, indices, triggers, and views, is contained in a single disk file.

Below is the list of features available for every review record.

1. Id
2. ProductId - unique identifier for the product
3. UserId - unique identifier for the user
4. ProfileName
5. HelpfulnessNumerator - number of users who found the review helpful
6. HelpfulnessDenominator - number of users who indicated whether they found the review helpful
7. Score - rating between 1 and 5
8. Time - timestamp for the review
9. Summary - brief summary of the review
10. Text - text of the review

Objective:
Given a review, find out whether it is positive or negative.

In [None]:
import pandas as pd
import numpy as np
import sqlite3

In [None]:

# Reading Data from SQLite
# 1. Make a connection to SQLite file : https://www.dataquest.io/blog/python-pandas-databases/

con = sqlite3.connect('drive/database.sqlite')

# 2. Get the cursor and execute query - Sample execution to understand working with sqlite without pandas..
# The five fetched rows are kept in `rows` for inspection only; the analysis
# below works from the pandas frame, not from this cursor.
cur = con.cursor()
cur.execute("select * from reviews limit 5 ")
rows = cur.fetchall()
#print(rows)
cur.close()

# 3. Read the data into pandas using read_sql_query() function in pandas
# Reviews with score 3 are excluded by the query itself.

source_data = pd.read_sql_query('select * from reviews where score != 3',con)
print('*'*24)
print(source_data.shape)
# Last expression of the cell: displays the column index.
source_data.columns

************************
(525814, 10)


Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [None]:
# The reviews are already loaded into source_data, so release the SQLite connection.
con.close()

In [None]:
def part(x):
    """Turn a numeric star rating into a sentiment label.

    Parameters
    ----------
    x : number
        Review score. The query that builds ``source_data`` already excludes
        score 3, so in this notebook the input is 1, 2, 4 or 5.

    Returns
    -------
    str
        ``'negative'`` for ratings below 3, ``'positive'`` otherwise.
    """
    # Low ratings are the negative class. The original returned the labels the
    # other way round, which labelled the 4-5 star majority as "negative".
    if x < 3:
        return 'negative'
    return 'positive'

# Replace each numeric rating in Score with the label part() returns for it.
source_data.Score = source_data['Score'].map(part)

In [None]:
# Sanity check after relabelling: frame shape and number of reviews per label,
# followed by a preview of the first rows.
print(source_data.shape)
print(source_data['Score'].value_counts())
source_data.head()

(525814, 10)
negative    443777
positive     82037
Name: Score, dtype: int64


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,negative,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,positive,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,negative,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,positive,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,negative,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
## remove records with Numerator > Denominator
# i.e. keep only rows where HelpfulnessNumerator <= HelpfulnessDenominator.

filter1 = source_data[source_data['HelpfulnessNumerator']<=source_data['HelpfulnessDenominator']]
print(filter1.shape)
# Label counts after the filter.
filter1['Score'].value_counts()

(525812, 10)


negative    443775
positive     82037
Name: Score, dtype: int64

# Deduplication Process

In [None]:
## Noticed that Text is the key feature so would like to remove duplicates
## Below is the code to find duplicates in Text column
# keep=False flags every member of a duplicate group (not just the later
# copies), and sorting by Text puts identical reviews next to each other.
# filter2 is only a preview here; the deduplication cell below reassigns it.

filter2 = filter1[filter1.duplicated(['Text'],keep=False)].sort_values('Text')
print(filter2.shape)
filter2.head()

(215639, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
237606,257786,B000KOWR8E,A142S4ZZF1FJ1X,Joseph E Brew,2,3,negative,1286582400,Better Sweetener!,"""4C Totally Light"" is one of the very few ""sug..."
468638,506746,B000KOWR8Y,A142S4ZZF1FJ1X,Joseph E Brew,0,0,negative,1286582400,4C Totally Light,"""4C Totally Light"" is one of the very few ""sug..."
99163,107705,B001F0RRTQ,A1R7E82MN0S8V3,DENNIS,0,0,negative,1339459200,GREAT DOG TREAT,"""BUFFY"" LOOKS FORWARD TO HER ""TOY"" EVERY AFTER..."
387101,418610,B001F0RRU0,A1R7E82MN0S8V3,DENNIS,0,0,negative,1339459200,GREAT DOG TREAT,"""BUFFY"" LOOKS FORWARD TO HER ""TOY"" EVERY AFTER..."
304784,330090,B001OHX1ZY,A7FNPP1SMY97G,D. Hsu,4,6,positive,1320710400,Buy this if you have NO taste buds!,"""Blends smooth and creamy for a sweet tasting ..."


When we look at the duplicate records with the same text, we can see that the same person has, at the same timestamp, given the same review for two or more similar products, which is not plausible in reality.
E.g. the first two records in the output above.

Hence we remove these duplicates, treating rows that share the same UserId, ProfileName, Time and Text as one review.

In [None]:
# Order the reviews by ProductId; every other sort option is left at the pandas default.
sorted_data = filter1.sort_values('ProductId')
# Within each (UserId, ProfileName, Time, Text) group keep only the first row.
filter2 = sorted_data.drop_duplicates(subset=["UserId", "ProfileName", "Time", "Text"])
filter2.shape

(364171, 10)

In [None]:
# Look for what the first pass left behind: rows that still share UserId,
# ProfileName and Text (Time is not part of the key this time).
temp = filter2[filter2.duplicated(['UserId',"ProfileName","Text"],keep=False)].sort_values('Text')
print(temp.shape)
temp.head()

(505, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
287090,311004,B001EO6FPU,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",9,9,negative,1297036800,Great Diabetic Friendly Sweetener - Highly Rec...,"""Erythritol"" has become one of our favorite su..."
67574,73444,B0046IISFG,A3OXHLG6DIBRW8,"C. F. Hill ""CFH""",1,1,negative,1342915200,Great Diabetic Friendly Sweetwner - Highly Rec...,"""Erythritol"" has become one of our favorite su..."
302818,327982,B0000CEQ6H,A281NPSIMI1C2R,"Rebecca of Amazon ""The Rebecca Review""",3,3,negative,1084492800,Superior for Bread Baking,"""We use and believe in stone milling because n..."
494235,534333,B0000CEQ72,A281NPSIMI1C2R,"Rebecca of Amazon ""The Rebecca Review""",1,1,negative,1093651200,Bob's Red Mill Whole Wheat Flour,"""We use and believe in stone milling because n..."
164025,177904,B000PSFW9Q,A1YUL9PCJR3JTY,"O. Brown ""Ms. O. Khannah-Brown""",1,1,negative,1156723200,Perfect Morning Tea (Caffeinated),*****<br /><br />Numi Tea's Chinese Breakfast ...


Removing further records with the same text - deduplicating by UserId, ProfileName and Text (ignoring Time).

In [None]:
# Order the reviews by ProductId; every other sort option is left at the pandas default.
sorted_data = filter2.sort_values('ProductId')
# Second pass: keep the first row of every (UserId, ProfileName, Text) group.
final = sorted_data.drop_duplicates(subset=["UserId", "ProfileName", "Text"])
final.shape

(363897, 10)

# Exploratory Data Analysis

In order to perform the analysis we need to convert the text to numeric vectors. We will use the following approaches to do so.

1. Bag of Words
2. TF IDF
3. Word2Vec - Avg, Weighted(TF-IDF) W2V ( Considers Semantic Meaning to generate Vectors)

In the preprocessing phase we do the following, in the order below:

1. Begin by removing the HTML tags
2. Remove punctuation and a limited set of special characters such as , or . or # etc.
3. Check that the word is made up of English letters only and is not alphanumeric
4. Check that the length of the word is greater than 2 (reportedly there are no 2-letter adjectives)
5. Convert the word to lowercase
6. Remove stop words
7. Finally, apply Snowball stemming to the word (it was observed to be better than Porter stemming)

In [None]:
# Using Regular expression we will perform https://pymotw.com/2/re/
import re

def cleanhtml(sentence):
    """Replace every HTML-like tag in ``sentence`` with a single space.

    A tag is the shortest run that starts with ``<`` and ends with ``>`` on the
    same line (``.`` does not match a newline here).

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    str
        The text with each tag swapped for one space; other characters are
        left as they were.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', sentence)
def cleanpunch(sentence):
    """Strip or blank out a fixed set of punctuation characters.

    Two passes are applied in order:

    1. ``?``, ``!``, ``'``, ``"``, ``#`` and ``|`` are deleted.
    2. ``.``, ``,``, ``(``, ``)`` and ``/`` are each replaced by one space.

    The ``|`` characters inside the brackets are literal members of the
    character classes, and ``\\|`` is an escaped pipe, so a backslash in the
    input is NOT touched.

    Parameters
    ----------
    sentence : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    drop_pattern = r'[?|!|\'|"|#]'
    blank_pattern = r'[.|,|)|(|\|/]'
    without_marks = re.sub(drop_pattern, r'', sentence)
    return re.sub(blank_pattern, r' ', without_marks)


In [None]:
import string
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords


In [None]:
# Fetch only the stop-word corpus, by id, so the cell does not stop at the
# interactive NLTK downloader prompt during "Run All".
nltk.download('stopwords')

# Import the corpus reader under a private alias: the name `stopwords` is
# rebound to the plain set below, and without the alias a second run of this
# cell would call .words() on that set and fail.
from nltk.corpus import stopwords as _stopwords_corpus

# English stop words as a set (used for membership tests during cleaning).
stopwords = set(_stopwords_corpus.words('english'))
# Snowball stemmer for English, applied to every kept word.
snowstemmer = SnowballStemmer('english')

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> stopwords
    Downloading package stopwords to /content/nltk_data...
      Unzipping corpora/stopwords.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [None]:
# Print the stop-word set under a banner line.
print('*'*10,'Stop Words','*'*10,'\n',stopwords)

********** Stop Words ********** 
 {'hadn', 'yourself', 'by', 'hers', 'this', "didn't", 'more', "shan't", 'have', 'through', 'between', 'whom', 'if', "shouldn't", 'ma', 'on', 'an', 'some', 'same', 'then', 'who', 'be', 'wouldn', 'or', 'nor', "should've", 'his', "isn't", 'wasn', 'couldn', 'only', 'her', 'can', 'few', 'am', 'those', 'again', 'been', 'having', 'down', 'she', 'my', 'of', "mightn't", 'm', 'we', "you'll", 'does', 'do', 'were', 'i', 'doing', 'our', 'when', 'both', 'ain', 'there', "you've", 'did', 'than', "don't", 'isn', 'at', 'myself', 'you', 's', 'which', 'himself', 'any', 'has', 'weren', 'further', 't', 'up', "wouldn't", 'y', 'him', 'will', "hasn't", 'hasn', 'against', "that'll", 're', 'o', 'out', 'during', "doesn't", 'themselves', 'about', 'yours', 'he', 'them', "it's", 'and', 'how', 'haven', 'once', 'don', 'such', 'under', "you're", "couldn't", "mustn't", 'what', 'no', 'now', "aren't", 'll', 'where', 'was', 'after', "wasn't", 'it', 'they', 'with', 'too', 'here', 'for', 'th

In [None]:
# Clean every review and collect the stemmed words.
#   final_string       - one cleaned, space-joined byte string per review
#   all_positive_words - stems taken from reviews labelled 'positive'
#   all_negative_words - stems taken from reviews labelled 'negative'
all_negative_words = []
all_positive_words = []
final_string = []
score_values = final['Score'].values

# Walk texts and labels together instead of keeping a manual index in step.
for text, score in zip(final.Text.values, score_values):
    filtered_sentence = []
    for word in cleanpunch(cleanhtml(text)).split():
        # Keep purely alphabetic tokens longer than two characters.
        if not (word.isalpha() and len(word) > 2):
            continue
        lowered = word.lower()
        # Stop-word removal. The original tested `word.islower() not in stopwords`,
        # i.e. a bool against a set of strings, so no stop word was ever dropped.
        if lowered in stopwords:
            continue
        # Stems are stored as UTF-8 bytes, as before, so CleanedText keeps its type.
        s = snowstemmer.stem(lowered).encode('utf8')
        filtered_sentence.append(s)
        if score == 'positive':
            all_positive_words.append(s)  # words used in positive reviews
        elif score == 'negative':
            all_negative_words.append(s)  # words used in negative reviews
    final_string.append(b" ".join(filtered_sentence))  # final string of cleaned words

In [None]:
# Peek at a few cleaned reviews (final_string is a plain list, so it is sliced, not .shape'd).
print(final_string[1:10])
# Add the cleaned text as a new column. assign() returns a new frame, which
# avoids the SettingWithCopyWarning raised by writing a column into `final`
# (itself derived from a filtered frame).
final = final.assign(CleanedText=final_string)

[b'this great littl book read has nice rhythm well good repetit that littl one like the line about chicken soup with rice the child get through the month the year and wonder place like bombay and down the nile all the while eat well you know what they get eat some kid will have mauric sendak version ice skate how treat rose their head for long time and they wont even know where came from surpris came from this littl witti book', b'this fun way for children learn their month the year will learn all the poem throughout the school year they like the handmot which invent for each poem', b'grew read these sendak book and watch the realli rosi movi that incorpor them and love them son love them too howev miss the hard cover version the paperback seem kind flimsi and take two hand keep the page open', b'get the movi sound track and sing along with carol king this great stuff whole extend famili know these song heart qualiti kid storytel and music', b'veri entertain rhyme and catchi the illust

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
# Preview the first rows of the deduplicated frame, now including CleanedText.
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138693,150511,6641040,A1C9K534BCI9GO,Laura Purdie Salas,0,0,negative,1344211200,Charming and childlike,"A charming, rhyming book that describes the ci...",b'charm rhyme book that describ the circumst u...
138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,negative,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,b'this great littl book read has nice rhythm w...
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,negative,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,b'this fun way for children learn their month ...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,negative,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",b'grew read these sendak book and watch the re...
138687,150505,6641040,A2PTSM496CF40Z,"Jason A. Teeple ""Nobody made a greater mistak...",1,1,negative,1210809600,A classic,Get the movie or sound track and sing along wi...,b'get the movi sound track and sing along with...


In [None]:
# Disabled: the block below is a bare string literal, so none of it runs; the
# cell only echoes the string. It sketches saving `final` to drive/final.sqlite.
'''
conn = sqlite3.connect('drive/final.sqlite')
c=conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, flavor=None, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)
conn.close()
'''

"\nconn = sqlite3.connect('drive/final.sqlite')\nc=conn.cursor()\nconn.text_factory = str\nfinal.to_sql('Reviews', conn, flavor=None, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)\nconn.close()\n"

# BOW - Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words with the default CountVectorizer settings, fitted on the raw Text column.
# NOTE(review): the CleanedText column built earlier is not used here — confirm that is intended.
cv = CountVectorizer()
bow_ds = cv.fit_transform(final['Text'].values)


In [None]:
# Type and shape of the bag-of-words matrix returned by fit_transform.
print(type(bow_ds))
print(bow_ds.get_shape())
#print(cv.get_feature_names())

<class 'scipy.sparse.csr.csr_matrix'>
(363897, 115281)


In [None]:
# Bi Gram
# ngram_range=(1,2) keeps the unigrams and adds bigrams; fitted on the raw Text column.
cv_2gr = CountVectorizer(ngram_range=(1,2))
bigram_bow_ds = cv_2gr.fit_transform(final['Text'].values)


In [None]:
# Shape of the unigram+bigram matrix and a 20-item sample of its feature names.
print(bigram_bow_ds.get_shape())
print(cv_2gr.get_feature_names()[200000:200020])

(363897, 2910192)
['around eat', 'around eating', 'around ebay', 'around edge', 'around edges', 'around effects', 'around eight', 'around either', 'around elsewhere', 'around empty', 'around ended', 'around endlessly', 'around ends', 'around enjoy', 'around enjoyed', 'around enjoying', 'around enough', 'around esp', 'around especially', 'around espresso']


## TF-IDF - Term Frequency & Inverse Document Frequency

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

#tf = TfidfVectorizer(ngram_range=(1,2))
# TF-IDF with default settings (the unigram+bigram variant above is left
# disabled), fitted on the raw Text column.
tf = TfidfVectorizer()
tf_ds = tf.fit_transform(final['Text'].values)

In [None]:
# Shape of the TF-IDF matrix and a 20-item sample of its feature names.
print(tf_ds.get_shape())
print(tf.get_feature_names()[2000:2020])

(363897, 115281)
['210miligrams', '211', '2110', '2114', '212', '2120mg', '21261516', '213', '2138725', '214', '2143kcal', '215', '2150mg', '2154', '215g', '215grams', '216', '2160', '21619', '2167cx']


## Word2Vec Model

In [None]:
!pip install --upgrade gensim
#!easy_install -U gensim

Collecting gensim
  Downloading gensim-3.3.0-cp36-cp36m-manylinux1_x86_64.whl (22.5MB)
[K    100% |████████████████████████████████| 22.5MB 63kB/s 
[?25hCollecting smart-open>=1.2.1 (from gensim)
  Downloading smart_open-1.5.6.tar.gz
Collecting scipy>=0.18.1 (from gensim)
  Downloading scipy-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (50.0MB)
[K    100% |████████████████████████████████| 50.0MB 28kB/s 
[?25hRequirement already up-to-date: six>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from gensim)
Requirement already up-to-date: numpy>=1.11.3 in /usr/local/lib/python3.6/dist-packages (from gensim)
Collecting boto3 (from smart-open>=1.2.1->gensim)
  Downloading boto3-1.5.26-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 7.3MB/s 
[?25hCollecting boto>=2.32 (from smart-open>=1.2.1->gensim)
  Downloading boto-2.48.0-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 1.0MB/s 
[?25hCollecting bz2file (from smart-open>

In [None]:
import gensim

## Cleaning the data
# One token list per review: strip HTML tags, strip punctuation, split on
# whitespace, keep purely alphabetic tokens and lowercase them. cleanpunch()
# works character by character and leaves whitespace alone, so cleaning the
# whole review and splitting once yields the same tokens as cleaning it word
# by word.
list_of_sent = [
    [token.lower() for token in cleanpunch(cleanhtml(sent)).split() if token.isalpha()]
    for sent in final['Text'].values
]



In [None]:
#list_of_sent[0]
# Train Word2Vec on the tokenised reviews; size=50 is the vector length the
# averaging cells below rely on (they allocate np.zeros(50)).
# NOTE(review): no seed is passed and workers=4, so the learned vectors
# presumably differ from run to run — confirm whether reproducibility matters.
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=4)


In [None]:
# Words present in the trained model's vocabulary, and how many there are.
words = list(w2v_model.wv.vocab)
print(len(words))

33737


In [None]:
#w2v_model.wv.most_similar('charming')
# Show the learned vector for one example word.
w2v_model.wv['charming']

array([-4.76558916e-02, -2.97644913e-01,  4.87414479e-01, -2.39514142e-01,
       -2.30322704e-02, -1.06650345e-01, -4.56724130e-02,  4.43276092e-02,
        1.91595986e-01, -3.75310169e-03,  1.41314447e-01, -1.72058851e-01,
       -6.05750568e-02, -4.00242507e-01, -1.43326789e-01,  2.12644309e-01,
       -1.44621208e-01,  2.30137765e-01, -4.25470173e-01,  1.60405904e-01,
        2.29007557e-01, -1.09626189e-01, -2.17108279e-01, -2.33942494e-01,
       -1.22239172e-01, -1.14763163e-01, -2.55856961e-02,  3.06401681e-02,
        2.01803476e-01, -6.76738396e-02,  3.79949063e-01,  2.54258290e-02,
       -4.63318527e-02, -7.38562420e-02,  1.58685893e-01,  2.56230742e-01,
       -1.26942424e-02, -3.08156341e-01,  2.76695155e-02,  3.36574703e-01,
        9.68114837e-05, -3.91068548e-01, -2.92272754e-02, -4.33200926e-01,
       -2.13100985e-01,  9.90905538e-02,  1.14401221e-01,  1.61946788e-02,
       -9.37332958e-02,  1.64955065e-01], dtype=float32)

In [None]:
# Avg W2V:
# (sum of the word vectors of a review) / (number of words that have a vector)

sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent:  # for each review/sentence
    sent_vec = np.zeros(50)  # must match size=50 used when training w2v_model
    cnt_words = 0  # num of words with a valid vector in the sentence/review
    for word in sent:  # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            # Word has no vector in the model (presumably below min_count); skip it.
            # Only the lookup failure is caught, so other errors are not hidden.
            continue
        sent_vec += vec
        cnt_words += 1
    # A review with no known word keeps the all-zero vector instead of
    # dividing by zero and producing NaNs plus a runtime warning.
    if cnt_words:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

  del sys.path[0]


363897
50


In [None]:
# Disabled experiment: a bare string literal, so none of it runs; the cell only
# echoes the string. It sketches timing a dense conversion of the whole tf_ds matrix.
'''import datetime as dt

start = dt.datetime.now()
temp_ds = tf_ds.toarray().tolist()[0]
print(temp_ds.shape)
end = dt.datetime.now()
print('time taken: ',(end-start))
'''

"import datetime as dt\n\nstart = dt.datetime.now()\ntemp_ds = tf_ds.toarray().tolist()[0]\nprint(temp_ds.shape)\nend = dt.datetime.now()\nprint('time taken: ',(end-start))\n"

In [None]:
import datetime as dt
start = dt.datetime.now()

# Time 100 repetitions of turning one TF-IDF row (row 1) into a dense Python list.
for i in range(100):
    row_rec = tf_ds[1].toarray().tolist()[0]
end = dt.datetime.now()
print(start, end, (end-start))
# Last expression of the cell: shows the type of the densified row.
type(row_rec)


2018-02-10 15:09:55.830802 2018-02-10 15:09:56.088890 0:00:00.258088


list

In [None]:
# Feature names of the TF-IDF matrix, in column order.
tfidf_feat = tf.get_feature_names()
print(tfidf_feat[:10])
# dict1: column position -> word; dict2: word -> column position.
dict1 = {position: term for position, term in enumerate(tfidf_feat)}
dict2 = {term: position for position, term in dict1.items()}
print(dict2['000013'])

['00', '000', '0000', '000001', '00001', '000013', '0000soo', '0001', '000111052', '0002251337']
5


In [None]:

import datetime as dt

start = dt.datetime.now()
# TF-IDF weighted Word2Vec:
# for every review, sum(word vector * tf-idf of the word in that review)
# divided by the sum of those tf-idf weights.
tfidf_feat = tf.get_feature_names()  # tfidf words/col-names
# tf_ds is the sparse matrix with row = review, col = word, cell value = tfidf
print(type(tfidf_feat))

# Words with no Word2Vec vector or no TF-IDF column. Both lookups are global,
# so a word that fails once can be skipped for every later review. A set keeps
# the membership test constant-time; len() and set() on it still work for the
# summary cell that follows.
not_found = set()
tfidf_sent_vectors = []  # the tfidf-w2v for each sentence/review is stored in this list
row = 0

for sent in list_of_sent:  # for each review/sentence
    sent_vec = np.zeros(50)  # must match size=50 used when training w2v_model
    weight_sum = 0  # running sum of the tf-idf weights used for this review
    # Read this review's stored tf-idf entries once (column -> weight). The
    # original densified tf_ds[1] - always row 1 - on every iteration and never
    # used the result, which dominated the run time.
    review_row = tf_ds[row]
    row_weights = dict(zip(review_row.indices.tolist(), review_row.data.tolist()))
    for word in sent:  # for each word in a review/sentence
        if word in not_found:
            continue
        try:
            vec = w2v_model.wv[word]
            col = dict2[word]  # dict2 maps word -> tf-idf column (built in the cell above)
        except KeyError:
            # Only missing-word lookups are expected here; other errors surface.
            not_found.add(word)
            continue
        # A word absent from this review's tf-idf row gets weight 0, as with
        # direct sparse indexing.
        tf_idf = row_weights.get(col, 0.0)
        sent_vec += (vec * tf_idf)
        weight_sum += tf_idf
    # Leave the zero vector in place when no word carried weight, instead of
    # dividing by zero.
    if weight_sum:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1
    if (row % 1000) == 0:
        print('reached', row)

end = dt.datetime.now()
print(start)
print(end)
print('Total time', (end-start))


<class 'list'>
reached 1000
reached 2000
reached 3000
reached 4000
reached 5000
reached 6000
reached 7000
reached 8000
reached 9000
reached 10000
reached 11000
reached 12000
reached 13000
reached 14000
reached 15000
reached 16000
reached 17000
reached 18000
reached 19000
reached 20000
reached 21000
reached 22000
reached 23000
reached 24000
reached 25000
reached 26000
reached 27000
reached 28000
reached 29000
reached 30000
reached 31000
reached 32000
reached 33000
reached 34000
reached 35000
reached 36000
reached 37000
reached 38000
reached 39000
reached 40000
reached 41000
reached 42000
reached 43000
reached 44000
reached 45000
reached 46000
reached 47000
reached 48000
reached 49000
reached 50000
reached 51000
reached 52000
reached 53000
reached 54000
reached 55000
reached 56000
reached 57000
reached 58000
reached 59000
reached 60000
reached 61000
reached 62000
reached 63000
reached 64000
reached 65000
reached 66000
reached 67000
reached 68000
reached 69000
reached 70000
reached 71000


KeyboardInterrupt: ignored

In [None]:
# Number of words skipped by the weighted-vector loop, then the de-duplicated list of them.
print(len(not_found),'\n',list(set(not_found)))