In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from tqdm.notebook import tqdm
sns.set()

In [2]:
#create connection to sql database
conn = sqlite3.connect('database.sqlite')

In [3]:
q = """SELECT name FROM sqlite_master WHERE type='table'"""
tables = pd.read_sql_query(q,conn)

In [4]:
tables

Unnamed: 0,name
0,Reviews


In [5]:
q = """SELECT * FROM Reviews WHERE Score != 3"""
df = pd.read_sql_query(q,conn)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
#568454
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525814 entries, 0 to 525813
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      525814 non-null  int64 
 1   ProductId               525814 non-null  object
 2   UserId                  525814 non-null  object
 3   ProfileName             525814 non-null  object
 4   HelpfulnessNumerator    525814 non-null  int64 
 5   HelpfulnessDenominator  525814 non-null  int64 
 6   Score                   525814 non-null  int64 
 7   Time                    525814 non-null  int64 
 8   Summary                 525814 non-null  object
 9   Text                    525814 non-null  object
dtypes: int64(5), object(5)
memory usage: 40.1+ MB


1. Id - the id of the row
2. ProductId- the id of the particular product
3. UserId - the id of the user 
4. ProfileName - the profile name of the user
5. HelpfulnessNumerator - the no of user who found the review helpful
6. HelpfulnessDenominator - the no of user who voted the answer
7. Score - the rating given by the user to the product
8. Time - the time when the review was added
9. Summary -  the title of the review
10. Text -  the actual review

In [7]:
#Data cleaning
#check for reviews other than food
#check for duplicate entries
#check for inconsistent data
df.sort_values('ProductId',axis=0,inplace=True)
clean_df = df.drop_duplicates(subset={'UserId','Score','Time','Text'},keep='first')
clean_df.shape

(364183, 10)

In [8]:
#remaining %of data
sum(clean_df['Id'].value_counts())/sum(df['Id'].value_counts())

0.6926080324981838

In [9]:
#inconsistent data
clean_df.loc[clean_df.HelpfulnessNumerator>clean_df.HelpfulnessDenominator]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
59301,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
41159,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [10]:
#drop inconsistent data
clean_df1 = clean_df.drop(clean_df['Id'].loc[clean_df.HelpfulnessNumerator>clean_df.HelpfulnessDenominator])

In [11]:
#convert the Text and Summary cols into lower case
lower_summary = clean_df1.Summary.str.lower()
clean_df1['Summary'] = lower_summary
lower_text = clean_df1.Text.str.lower()
clean_df1['Text'] = lower_text

In [12]:
clean_df1.reset_index()
clean_df1.shape

(364181, 10)

In [13]:
import re
#removing reviews other than food like book or music
patterns=[r'\bbooks?\b','\breads?\b',r'\breading\b',r'\bpoetry\b',r'\bmusic\b',r'\bplay\b',r'\bplaying\b',r'\bmovies?\b',r'\bpoems?\b']
final=clean_df1
for pattern in tqdm(patterns):
    final=final.drop(list(final[final.Text.str.contains(pattern)].index),axis=0)
    final=final.drop(list(final[final.Summary.str.contains(pattern)].index),axis=0)
final

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))




Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
138686,150504,0006641040,AQEYF1AXARWJZ,"Les Sinclair ""book maven""",1,1,4,1212278400,chicken soup with rice,a very entertaining rhyming story--cleaver and...
476617,515426,141278509X,AB1A5EGHHVA9M,CHelmic,1,1,5,1332547200,the best drink mix,this product by archer farms is the best drink...
22621,24751,2734888454,A1C298ITT645B6,Hugh G. Pritchard,0,0,5,1195948800,dog lover delites,our dogs just love them. i saw them in a pet ...
22620,24750,2734888454,A13ISQV0U9GZIC,Sandikaye,1,1,2,1192060800,made in china,my dogs loves this chicken but its a product f...
157850,171161,7310172001,AFXMWPNS1BLU4,H. Sandler,0,0,5,1229385600,excellent treats,i have been feeding my greyhounds these treats...
...,...,...,...,...,...,...,...,...,...,...
178145,193174,B009RSR8HO,A4P6AN2L435PV,romarc,0,0,5,1350432000,love!! love!!,"love, love this sweetener!! i use it in all m..."
173675,188389,B009SF0TN6,A1L0GWGRK4BYPT,Bety Robinson,0,0,5,1350518400,amazing!! great sauce for everything!,you have to try this sauce to believe it! it s...
204727,221795,B009SR4OQ2,A32A6X5KCP7ARG,sicamar,1,1,5,1350604800,awesome taste,i bought this hazelnut paste (nocciola spread)...
5259,5703,B009WSNWC4,AMP7K1O84DH1T,ESTY,0,0,5,1351209600,delicious,purchased this product at a local store in ny ...


In [14]:
#remaining data size
sum(final['Id'].value_counts())/sum(df['Id'].value_counts())

0.6767602231967959

In [15]:
#remove unwanted text from the reviews like instead of i've keep i have 
#replace n't, 'll, 's, 've, 're, 't, 'd, 'm
from tqdm.notebook import tqdm
def subsitute(text):
    text=re.sub(r'can\'t','can not',text)
    text=re.sub(r'won\'t','will not',text)
    
    text=re.sub(r'n\'t',' not',text)
    text=re.sub(r'\'ll',' will',text)
    text=re.sub(r'\'s',' is',text)
    text=re.sub(r'\'ve',' have',text)
    text=re.sub(r'\'re',' are',text)
    text=re.sub(r'\'t',' not',text)
    text=re.sub(r'\'d',' would',text)
    text=re.sub(r'\'m',' am',text)
    return text
    

In [16]:
#remove html tags
def removeHTML(text):
    text=re.sub(r'https?\S+','',text)
    text=re.sub(r'<.*?>','',text)
    return text

In [17]:
def removePunctuation(text):
    text=re.sub(r'[^A-za-z\s]','',text)#remove puntuations
    text=re.sub(r'\S*\d\S*','',text)#remove alphanumeric words
    return text

In [18]:
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [19]:
#clean the text review column and store in a list
final_reviews=[]
#test=[""""newman's own" products are generally very good, with high quality ingediants and perfect execution.  this organic dark chocolate bar (54% cocoa) was perfectly sweet, without being bitter or too rich.  two of us spent an hour slowly munching through one bar, savoring every bite.<br /><br />a real treat of high quality dark chocolate.  definitely worth a try!"""]
#final.Text.values
for text in tqdm(final.Text.values):
    text= removeHTML(text)#removes html tags and attributes
    text= subsitute(text)#converts short words to normal words
    text= removePunctuation(text).strip()#removes punctuation and alphanumeric words
    text = ' '.join(ele for ele in text.split() if ele not in stopwords)
    final_reviews.append(text.strip())
final_reviews

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=355850.0), HTML(value='')))




['entertaining rhyming storycleaver catchythe illustrations imaginative fit right however paperback somewhat small flimsy would opt bigger edition',
 'product archer farms best drink mix ever mix flavored packet oz water bottle contains natural sweetner stevia real fruit flavoring no food coloring colored fruit vegetable colors pure natural tastes great eight packets box contains calories per packet thank archer farms',
 'dogs love saw pet store tag attached regarding made china satisfied safe',
 'dogs loves chicken product china wont buying anymore hard find chicken products made usa one isnt bad good product wont take chances till know going china imports',
 'feeding greyhounds treats years hounds little finicky love treats expensive relative biscuits find good addition diet treats easy teeth since protein treat careful not overindulge pet regular basis great treats take walks stuff pocket easily found prices vary vendor vendor shop around',
 'one product welsh terrier eat sophie foo

In [20]:
#clean the summary column and store in a list
final_summary=[]
#test=[""""newman's own" products are generally very good, with high quality ingediants and perfect execution.  this organic dark chocolate bar (54% cocoa) was perfectly sweet, without being bitter or too rich.  two of us spent an hour slowly munching through one bar, savoring every bite.<br /><br />a real treat of high quality dark chocolate.  definitely worth a try!"""]
#final.Text.values
for text in tqdm(final.Summary.values):
    text= removeHTML(text)#removes html tags and attributes
    text= subsitute(text)#converts short words to normal words
    text= removePunctuation(text).strip()#removes punctuation and alphanumeric words
    text = ' '.join(ele for ele in text.split() if ele not in stopwords)
    final_summary.append(text.strip())
final_summary

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=355850.0), HTML(value='')))




['chicken soup rice',
 'best drink mix',
 'dog lover delites',
 'made china',
 'excellent treats',
 'sophie treats',
 'best healthy dog treat',
 'alaskan malamute loves',
 'best treat ever',
 'year old maltese always loved',
 'dogs cats ferrets love',
 'snouts',
 'best dog treat ever',
 'great puppy training',
 'great',
 'terrific treats',
 'dog loves',
 'happy dog',
 'perfect small dogs',
 'good product slow super saver shipping',
 'dog treats',
 'great product dogs',
 'not buy used',
 'dogs love best treat rewards training',
 'great product',
 'amazing training treat',
 'best dog treat great training dogs love',
 'dog favorite treat',
 'careful dogs love',
 'pro treat dried dog treats',
 'great treat',
 'crack dogs',
 'good stuff',
 'great product great deal',
 'great diabetic dog',
 'good',
 'dogs love',
 'fast shipment',
 'dogs love',
 'golden loves',
 'corgi cocaine',
 'healthy high quality dog treat',
 'save best training',
 'great product value',
 'best dog treats',
 'dogs love'

In [21]:
#generate random reviews to check
for i in range(10):
    r=np.random.randint(0,356001)
    print(final.Text.iloc[r])
    print('='*100)

the pacific chai vanilla latte (decaffeinated) is so good. i like it in the evening as it does not disturb my sleep. it is a great alternative to coffee, hot chocolate or regular tea and the taste is wonderful.
i add this cherry berry and one of the other fruit packages to my tea each morning.  delicious and healthy.
this is good instant decaf coffee!!!  it smells good, it looks good, and the flavor is better than most cups of regular coffee served in restaurants.  that's my opinion and i'm stickin' to it.
i really enjoy african coffees and particularly the ethiopian varieties. the medium roast was probable appropriate for this bean, but the roasting was very inconsistent which gave it a green coffee bean flavor.  it was reasonalby priced but not worth buying.
what a convenience it is using the senseo coffee pods, no mess, pre-measured, and less than a minute from start to finish--which is a rich, flavorful cup of coffee!!<br />besides that, it is often difficult to locate a local stor

In [22]:
#Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction import stop_words



In [23]:
snbstemmer = SnowballStemmer("english")
wnl = WordNetLemmatizer()

In [24]:
summary_stem=[]
for ele in final_summary:
    words = ele.split()
    summary_stem.append(' '.join([snbstemmer.stem(word) for word in words]))

In [25]:
summary_stem[0:10]

['chicken soup rice',
 'best drink mix',
 'dog lover delit',
 'made china',
 'excel treat',
 'sophi treat',
 'best healthi dog treat',
 'alaskan malamut love',
 'best treat ever',
 'year old maltes alway love']

In [26]:
summary_lemma=[]
for ele in final_summary:
    words = ele.split()
    summary_lemma.append(' '.join([wnl.lemmatize(word) for word in words]))

In [27]:
summary_lemma[0:10]

['chicken soup rice',
 'best drink mix',
 'dog lover delites',
 'made china',
 'excellent treat',
 'sophie treat',
 'best healthy dog treat',
 'alaskan malamute love',
 'best treat ever',
 'year old maltese always loved']

In [28]:
text_stem=[]
for ele in final_reviews:
    words = ele.split()
    text_stem.append(' '.join([snbstemmer.stem(word) for word in words]))

In [29]:
text_lemma=[]
for ele in final_reviews:
    words = ele.split()
    text_lemma.append(' '.join([wnl.lemmatize(word) for word in words]))

In [30]:
text_stem[0:10]

['entertain rhyme storycleav catchyth illustr imagin fit right howev paperback somewhat small flimsi would opt bigger edit',
 'product archer farm best drink mix ever mix flavor packet oz water bottl contain natur sweetner stevia real fruit flavor no food color color fruit veget color pure natur tast great eight packet box contain calori per packet thank archer farm',
 'dog love saw pet store tag attach regard made china satisfi safe',
 'dog love chicken product china wont buy anymor hard find chicken product made usa one isnt bad good product wont take chanc till know go china import',
 'feed greyhound treat year hound littl finicki love treat expens relat biscuit find good addit diet treat easi teeth sinc protein treat care not overindulg pet regular basi great treat take walk stuff pocket easili found price vari vendor vendor shop around',
 'one product welsh terrier eat sophi food alergi care feed dog oscar also get treat love no food alergi product simpl ingredi no addit dog need'

In [31]:
type(text_stem)

list

In [32]:
text_lemma[0:10]

['entertaining rhyming storycleaver catchythe illustration imaginative fit right however paperback somewhat small flimsy would opt bigger edition',
 'product archer farm best drink mix ever mix flavored packet oz water bottle contains natural sweetner stevia real fruit flavoring no food coloring colored fruit vegetable color pure natural taste great eight packet box contains calorie per packet thank archer farm',
 'dog love saw pet store tag attached regarding made china satisfied safe',
 'dog love chicken product china wont buying anymore hard find chicken product made usa one isnt bad good product wont take chance till know going china import',
 'feeding greyhound treat year hound little finicky love treat expensive relative biscuit find good addition diet treat easy teeth since protein treat careful not overindulge pet regular basis great treat take walk stuff pocket easily found price vary vendor vendor shop around',
 'one product welsh terrier eat sophie food alergies careful feed

In [33]:
#now we have reduced the words into their stem/lemma forms
#using this we can create BoW, TF-IDF or W2Vec to actually convert the words into numbers\
#which we can use then for training classification algorithms.
#we will go with stemming
bow_summary = CountVectorizer(ngram_range=(1,2),min_df=10,max_features=5000)
summary_final_bow_vec = bow_summary.fit_transform(summary_stem)
bow_text = CountVectorizer(ngram_range=(1,2),min_df=10,max_features=5000)
text_final_bow_vec = bow_text.fit_transform(text_stem)

In [34]:
bow_summary.get_feature_names()[:100] #viewing 100 features of summary out of 5000

['abl',
 'absolut',
 'absolut amaz',
 'absolut best',
 'absolut delici',
 'absolut fantast',
 'absolut favorit',
 'absolut love',
 'absolut wonder',
 'acai',
 'accept',
 'accord',
 'accur',
 'acid',
 'acid coffe',
 'acquir',
 'acquir tast',
 'activ',
 'actual',
 'actual tast',
 'actual work',
 'ad',
 'ad sugar',
 'add',
 'addict',
 'addit',
 'ador',
 'adult',
 'adventur',
 'advertis',
 'aerat',
 'aerogarden',
 'afford',
 'african',
 'afternoon',
 'aftertast',
 'agav',
 'agav nectar',
 'age',
 'ago',
 'agre',
 'ah',
 'ahmad',
 'ai',
 'ai not',
 'aid',
 'air',
 'al',
 'alcohol',
 'ale',
 'alert',
 'aliv',
 'allerg',
 'allergen',
 'allergi',
 'allnatur',
 'allpurpos',
 'alltim',
 'alltim favorit',
 'almond',
 'almond butter',
 'almond flour',
 'almost',
 'almost good',
 'almost like',
 'almost perfect',
 'alo',
 'alon',
 'alot',
 'alreadi',
 'alright',
 'also',
 'altern',
 'altern coffe',
 'altern regular',
 'altern soda',
 'altern sugar',
 'although',
 'altoid',
 'alway',
 'alway good',


In [35]:
bow_text.get_feature_names()[4900:] #viewing 100 features of reviews

['work well',
 'work wonder',
 'workout',
 'world',
 'worm',
 'worri',
 'wors',
 'worst',
 'worth',
 'worth everi',
 'worth extra',
 'worth money',
 'worth price',
 'worth tri',
 'would',
 'would also',
 'would better',
 'would buy',
 'would definit',
 'would eat',
 'would expect',
 'would find',
 'would get',
 'would give',
 'would given',
 'would go',
 'would good',
 'would great',
 'would help',
 'would high',
 'would like',
 'would love',
 'would make',
 'would never',
 'would nice',
 'would not',
 'would order',
 'would pay',
 'would prefer',
 'would probabl',
 'would purchas',
 'would rate',
 'would rather',
 'would recommend',
 'would say',
 'would still',
 'would suggest',
 'would take',
 'would tast',
 'would think',
 'would tri',
 'would use',
 'would want',
 'would work',
 'wow',
 'wrap',
 'wrapper',
 'write',
 'write review',
 'written',
 'wrong',
 'wrote',
 'xylitol',
 'yard',
 'yeah',
 'year',
 'year ago',
 'year love',
 'year not',
 'year old',
 'year tri',
 'year use',


In [36]:
tfidf_summary = TfidfVectorizer(ngram_range=(1,2),min_df=10,max_features=5000)
summary_tfidf= tfidf_summary.fit_transform(final_summary)
tfidf_text = TfidfVectorizer(ngram_range=(1,2),min_df=10,max_features=5000,lowercase=False)
text_tfidf = tfidf_text.fit_transform(text_stem)

In [37]:
tfidf_summary.get_feature_names()#top 100 features of summary using tfidf

['able',
 'absolute',
 'absolute best',
 'absolute favorite',
 'absolutely',
 'absolutely amazing',
 'absolutely best',
 'absolutely delicious',
 'absolutely love',
 'absolutely wonderful',
 'acai',
 'acceptable',
 'accurate',
 'acid',
 'acid coffee',
 'acidic',
 'acquired',
 'acquired taste',
 'active',
 'actual',
 'actually',
 'actually tastes',
 'ad',
 'add',
 'added',
 'added sugar',
 'addict',
 'addicted',
 'addicting',
 'addiction',
 'addictive',
 'adding',
 'addition',
 'additive',
 'additives',
 'adds',
 'adorable',
 'adore',
 'adult',
 'adults',
 'advertised',
 'advertisement',
 'advertising',
 'aerator',
 'aerogarden',
 'affordable',
 'african',
 'afternoon',
 'aftertaste',
 'agave',
 'agave nectar',
 'ages',
 'ago',
 'agree',
 'ah',
 'ahmad',
 'ai',
 'ai not',
 'aid',
 'air',
 'al',
 'alcohol',
 'ale',
 'alert',
 'alive',
 'allergen',
 'allergic',
 'allergies',
 'allergy',
 'allnatural',
 'allpurpose',
 'alltime',
 'alltime favorite',
 'almond',
 'almond butter',
 'almond fl

In [38]:
tfidf_text.get_feature_names()#top 100 features of text using tfidf

['abil',
 'abl',
 'abl buy',
 'abl eat',
 'abl find',
 'abl get',
 'abl order',
 'abl purchas',
 'absolut',
 'absolut best',
 'absolut delici',
 'absolut favorit',
 'absolut love',
 'absolut no',
 'absorb',
 'acai',
 'accept',
 'access',
 'accid',
 'accident',
 'accompani',
 'accord',
 'account',
 'accur',
 'accustom',
 'acid',
 'acquir',
 'acquir tast',
 'across',
 'act',
 'activ',
 'actual',
 'actual like',
 'actual tast',
 'ad',
 'ad bonus',
 'ad littl',
 'ad sugar',
 'add',
 'add bit',
 'add extra',
 'add flavor',
 'add littl',
 'add milk',
 'add sugar',
 'add water',
 'addict',
 'addit',
 'address',
 'adequ',
 'adjust',
 'admit',
 'adopt',
 'ador',
 'adult',
 'advantag',
 'advertis',
 'advic',
 'advis',
 'affect',
 'afford',
 'afraid',
 'afternoon',
 'aftertast',
 'aftertast not',
 'afterward',
 'agav',
 'agav nectar',
 'age',
 'ago',
 'agre',
 'agre review',
 'ahead',
 'ahoy',
 'aid',
 'air',
 'airtight',
 'al',
 'ala',
 'alcohol',
 'ale',
 'alert',
 'allerg',
 'allergi',
 'allna

In [39]:
summary_final_bow_vec.get_shape()

(355850, 5000)

In [40]:
text_final_bow_vec.get_shape()

(355850, 5000)

In [41]:
summary_tfidf.get_shape()

(355850, 5000)

In [42]:
y=final['Score']
y.columns=['idx','Score']
y = pd.DataFrame(y)
y

Unnamed: 0,Score
138686,4
476617,5
22621,5
22620,2
157850,5
...,...
178145,5
173675,5
204727,5
5259,5


In [43]:
y.Score.loc[y.Score<3]=0
y.Score.loc[y.Score>3]=1
y

Unnamed: 0,Score
138686,1
476617,1
22621,1
22620,0
157850,1
...,...
178145,1
173675,1
204727,1
5259,1


In [44]:
# Train test split
from sklearn.model_selection import train_test_split

xtrain,xtest,ytrain,ytest = train_test_split(text_tfidf,y,train_size=0.7,random_state=42)

In [45]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score as score
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

params = {'alpha':range(1,15)}
f1score = make_scorer(score)

clf = MultinomialNB()
clf2 = RandomizedSearchCV(clf,params,scoring=f1score,verbose=5,random_state=42)
#clf.fit(text_tfidf,y)

In [46]:
clf2.fit(xtrain,ytrain)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] alpha=10 ........................................................
[CV] ............................ alpha=10, score=0.932, total=   0.1s
[CV] alpha=10 ........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return f(**kwargs)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
  return f(**kwargs)
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ............................ alpha=10, score=0.934, total=   0.1s
[CV] alpha=10 ........................................................
[CV] ............................ alpha=10, score=0.933, total=   0.1s
[CV] alpha=10 ........................................................
[CV] ............................ alpha=10, score=0.932, total=   0.1s
[CV] alpha=10 ........................................................


  return f(**kwargs)
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
  return f(**kwargs)
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
  return f(**kwargs)


[CV] ............................ alpha=10, score=0.934, total=   0.1s
[CV] alpha=12 ........................................................
[CV] ............................ alpha=12, score=0.931, total=   0.1s
[CV] alpha=12 ........................................................
[CV] ............................ alpha=12, score=0.932, total=   0.1s
[CV] alpha=12 ........................................................


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[CV] ............................ alpha=12, score=0.931, total=   0.1s
[CV] alpha=12 ........................................................
[CV] ............................ alpha=12, score=0.930, total=   0.1s
[CV] alpha=12 ........................................................
[CV] ............................ alpha=12, score=0.931, total=   0.1s
[CV] alpha=1 .........................................................
[CV] ............................. alpha=1, score=0.940, total=   0.1s
[CV] alpha=1 .........................................................


  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=1, score=0.941, total=   0.1s
[CV] alpha=1 .........................................................
[CV] ............................. alpha=1, score=0.940, total=   0.1s
[CV] alpha=1 .........................................................
[CV] ............................. alpha=1, score=0.940, total=   0.1s
[CV] alpha=1 .........................................................


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=1, score=0.941, total=   0.1s
[CV] alpha=13 ........................................................
[CV] ............................ alpha=13, score=0.930, total=   0.1s
[CV] alpha=13 ........................................................
[CV] ............................ alpha=13, score=0.931, total=   0.1s
[CV] alpha=13 ........................................................

  return f(**kwargs)
  return f(**kwargs)



[CV] ............................ alpha=13, score=0.931, total=   0.1s
[CV] alpha=13 ........................................................
[CV] ............................ alpha=13, score=0.930, total=   0.1s


  return f(**kwargs)
  return f(**kwargs)


[CV] alpha=13 ........................................................
[CV] ............................ alpha=13, score=0.931, total=   0.1s
[CV] alpha=6 .........................................................
[CV] ............................. alpha=6, score=0.936, total=   0.1s
[CV] alpha=6 .........................................................


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=6, score=0.937, total=   0.1s
[CV] alpha=6 .........................................................
[CV] ............................. alpha=6, score=0.936, total=   0.1s
[CV] alpha=6 .........................................................
[CV] ............................. alpha=6, score=0.936, total=   0.1s


  return f(**kwargs)
  return f(**kwargs)


[CV] alpha=6 .........................................................
[CV] ............................. alpha=6, score=0.937, total=   0.1s
[CV] alpha=9 .........................................................


  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=9, score=0.933, total=   0.1s
[CV] alpha=9 .........................................................
[CV] ............................. alpha=9, score=0.934, total=   0.1s
[CV] alpha=9 .........................................................
[CV] ............................. alpha=9, score=0.934, total=   0.1s
[CV] alpha=9 .........................................................


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=9, score=0.933, total=   0.1s
[CV] alpha=9 .........................................................
[CV] ............................. alpha=9, score=0.934, total=   0.1s
[CV] alpha=3 .........................................................
[CV] ............................. alpha=3, score=0.938, total=   0.1s
[CV] alpha=3 .........................................................
[CV] ............................. alpha=3, score=0.939, total=   0.1s
[CV] alpha=3 .........................................................


  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=3, score=0.939, total=   0.1s
[CV] alpha=3 .........................................................
[CV] ............................. alpha=3, score=0.938, total=   0.1s
[CV] alpha=3 .........................................................
[CV] ............................. alpha=3, score=0.940, total=   0.1s
[CV] alpha=2 .........................................................


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=2, score=0.939, total=   0.1s
[CV] alpha=2 .........................................................
[CV] ............................. alpha=2, score=0.940, total=   0.1s
[CV] alpha=2 .........................................................


  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=2, score=0.939, total=   0.1s
[CV] alpha=2 .........................................................
[CV] ............................. alpha=2, score=0.939, total=   0.1s
[CV] alpha=2 .........................................................
[CV] ............................. alpha=2, score=0.940, total=   0.1s


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[CV] alpha=14 ........................................................
[CV] ............................ alpha=14, score=0.929, total=   0.1s
[CV] alpha=14 ........................................................
[CV] ............................ alpha=14, score=0.930, total=   0.1s
[CV] alpha=14 ........................................................

  return f(**kwargs)
  return f(**kwargs)



[CV] ............................ alpha=14, score=0.930, total=   0.1s
[CV] alpha=14 ........................................................
[CV] ............................ alpha=14, score=0.929, total=   0.1s
[CV] alpha=14 ........................................................


  return f(**kwargs)
  return f(**kwargs)


[CV] ............................ alpha=14, score=0.930, total=   0.1s
[CV] alpha=5 .........................................................
[CV] ............................. alpha=5, score=0.936, total=   0.1s
[CV] alpha=5 .........................................................
[CV] ............................. alpha=5, score=0.938, total=   0.1s
[CV] alpha=5 .........................................................


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[CV] ............................. alpha=5, score=0.937, total=   0.1s
[CV] alpha=5 .........................................................
[CV] ............................. alpha=5, score=0.937, total=   0.1s
[CV] alpha=5 .........................................................
[CV] ............................. alpha=5, score=0.938, total=   0.1s


  return f(**kwargs)
  return f(**kwargs)
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    5.2s finished
  return f(**kwargs)


RandomizedSearchCV(estimator=MultinomialNB(),
                   param_distributions={'alpha': range(1, 15)}, random_state=42,
                   scoring=make_scorer(f1_score), verbose=5)

In [47]:
clf2.best_params_

{'alpha': 1}

In [48]:
# Texting Phase
# Clean the review text by removing stop words, punctuations etc.
text="""bad taste. won't reccomend this to anyone"""
text= removeHTML(text)#removes html tags and attributes
text= subsitute(text)#converts short words to normal words
text= removePunctuation(text).strip()#removes punctuation and alphanumeric words
text = ' '.join(ele for ele in text.split() if ele not in stopwords)
test_review=text.strip()
test_review

'bad taste not reccomend anyone'

In [49]:
# Converting the review to its stem form
words = test_review.split()
test_review = ' '.join([snbstemmer.stem(w) for w in words])
test_review

'bad tast not reccomend anyon'

In [50]:
test = [test_review]
test= tfidf_text.transform(test)
test

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [70]:
a=clf4.predict_proba(test)
a[0][1]

0.050352998456587215

In [73]:
s = "{:.2f}".format(a[][0])
s

SyntaxError: invalid syntax (<ipython-input-73-cc88ddefbfc3>, line 1)

In [52]:
yhat = clf2.predict(xtest)

In [53]:
result = pd.DataFrame(yhat)
result

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,1
...,...
106751,1
106752,1
106753,1
106754,1


In [54]:
score(ytest,yhat)

0.9398788972725651

In [55]:
from sklearn.linear_model import LogisticRegression

clf3= LogisticRegression()
params={'C':[3,4,5,6,7,8]}
clf4 = RandomizedSearchCV(clf3,params,scoring=f1score,random_state=42,verbose=10)

In [56]:
clf4.fit(xtrain,ytrain)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  return f(**kwargs)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] C=3 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=3, score=0.961, total=   2.9s
[CV] C=3 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.6s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=3, score=0.961, total=   2.8s
[CV] C=3 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    8.6s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=3, score=0.961, total=   3.0s
[CV] C=3 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.5s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=3, score=0.961, total=   2.9s
[CV] C=3 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.4s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=3, score=0.962, total=   2.9s
[CV] C=4 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   17.3s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=4, score=0.961, total=   2.9s
[CV] C=4 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   20.3s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=4, score=0.961, total=   3.0s
[CV] C=4 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   23.1s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=4, score=0.961, total=   2.9s
[CV] C=4 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   26.2s remaining:    0.0s
  return f(**kwargs)


[CV] ................................. C=4, score=0.961, total=   3.0s
[CV] C=4 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=4, score=0.962, total=   2.9s
[CV] C=5 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=5, score=0.961, total=   3.0s
[CV] C=5 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=5, score=0.961, total=   2.8s
[CV] C=5 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=5, score=0.961, total=   3.0s
[CV] C=5 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=5, score=0.961, total=   2.9s
[CV] C=5 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=5, score=0.962, total=   2.9s
[CV] C=6 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=6, score=0.961, total=   2.8s
[CV] C=6 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=6, score=0.961, total=   2.9s
[CV] C=6 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=6, score=0.961, total=   2.9s
[CV] C=6 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=6, score=0.961, total=   2.9s
[CV] C=6 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=6, score=0.962, total=   3.0s
[CV] C=7 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=7, score=0.961, total=   2.9s
[CV] C=7 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=7, score=0.961, total=   2.9s
[CV] C=7 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=7, score=0.961, total=   2.9s
[CV] C=7 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=7, score=0.961, total=   2.9s
[CV] C=7 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=7, score=0.961, total=   2.9s
[CV] C=8 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=8, score=0.961, total=   2.9s
[CV] C=8 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=8, score=0.961, total=   2.8s
[CV] C=8 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=8, score=0.960, total=   2.9s
[CV] C=8 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  return f(**kwargs)


[CV] ................................. C=8, score=0.962, total=   3.0s
[CV] C=8 .............................................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.5min finished
  return f(**kwargs)


[CV] ................................. C=8, score=0.961, total=   3.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


RandomizedSearchCV(estimator=LogisticRegression(),
                   param_distributions={'C': [3, 4, 5, 6, 7, 8]},
                   random_state=42, scoring=make_scorer(f1_score), verbose=10)

In [57]:
clf4.best_estimator_

LogisticRegression(C=3)

In [58]:
yhat_logit= clf4.predict(xtest)

In [59]:
score(ytest,yhat_logit)

0.9617690826330532

In [62]:
clf4 = LogisticRegression(C=3)
clf4.fit(xtrain,ytrain)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=3)

In [63]:
import pickle

tfidf_file = 'tfidf_vectorizer.pkl'
lr_file = 'LR_model.pkl'

with open(tfidf_file,'wb') as file:
    pickle.dump(tfidf_text,file)
    
with open(lr_file,'wb') as file:
    pickle.dump(clf4,file)

In [None]:
clf4.predict()