In [1]:
import numpy as np

# **CountVectorizer**

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# COUNT VECTORIZER TAKES AN ARRAY OF TEXT DATA, WHICH CAN BE DOCUMENTS OR SENTENCES AND CONSTRUCTS THE BAG-OF-WORDS MODEL FOR US
# Note CountVectorizer is an  instance of some type. We need to initialize that instance first before we apply the .fit_transform(docs) Alemi 04/07/2020
count=CountVectorizer()


In [4]:
import numpy as np 
# Note there are 4 lines in Docs which is really 3 documents. Each document is separated by a ",". So the frequency of occurance for the word "is" = 3 for the 3rd document
# 1 for the first and 1 for the second document. The word "is" occurs max = 3. It also occurs in every document. 
docs = np.array(['The sun is shining on Zhongping',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet,'
                 'and one and one is two'])


In [5]:
# Transforming Words into Feature Vectors 
bag = count.fit_transform(docs)

In [6]:
print(count.vocabulary_)

{'the': 7, 'sun': 5, 'is': 1, 'shining': 4, 'on': 2, 'zhongping': 10, 'weather': 9, 'sweet': 6, 'and': 0, 'one': 3, 'two': 8}


In [7]:
# Each index position in the feature vectors shown here corresponds to the integer values that are stored as dictionary items in the CountVectorizer vocabulary
# For example, the first feature at index position 0 resembles the count of the word 'and' which only occurs in the last document. Zhongping occurs on index 10,
# recall our index starts with zero, and it (Zhongping) only occurs once! in the first document. We call these feature vectors: Raw Term Frequencies: tf(t,d) -
# the number of times a term t occurs in a document d. The sequence of items in the bag-of-words model we just created is also called the 1-gram or unigram model -
# Each item or token in the vocabulary represents a single word. More generally, the contigious sequences of items in NLP - words, letters, or symbols - are also called
# n-grams. 
# In 1-gram, we will have 1-gram: "The", "Sun", "is", "Shining"
# In 2-gram: "The sun", "Sun is", "is Shining"
# We could have initialized our CountVectorizer instance with ngram_range=(2,2)
print(bag.toarray())

[[0 1 1 0 1 1 0 1 0 0 1]
 [0 1 0 0 0 0 1 1 0 1 0]
 [2 3 0 2 1 1 1 2 1 1 0]]


In [8]:
# Let us capture the inverse of document frequency: idf(t,d) = log n/(1+df(d,t)) - where n is the number of our documents. Log is a weight and f(d,t) was explained before!
# Another transformer from scikit-learn library is TfidTransformer class, that takes the term frequencies from CountVectorizer class as input and transforms them into tf-idfs
!pip show tensorflow


Name: tensorflow
Version: 2.1.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /Users/piruzalemi/opt/anaconda3/lib/python3.7/site-packages
Requires: wheel, keras-applications, opt-einsum, numpy, absl-py, six, tensorboard, scipy, astor, keras-preprocessing, gast, tensorflow-estimator, wrapt, termcolor, google-pasta, grpcio, protobuf
Required-by: 


In [9]:
# https://pypi.python.org/pypi/libarchive
!apt-get -qq install -y libarchive-dev && pip install -q -U libarchive
import libarchive

/bin/sh: apt-get: command not found


In [10]:
# from sklearn.feature_extraction.text import TfidTransformer
!apt-get -qq install -y libfluidsynth1

/bin/sh: apt-get: command not found


In [11]:
#from sklearn.feature_extraction.text import TfidTransformer
!pip install tf-nightly

Collecting gast==0.3.3
  Using cached gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
[31mERROR: tensorflow 2.1.0 has requirement gast==0.2.2, but you'll have gast 0.3.3 which is incompatible.[0m
Installing collected packages: gast
  Attempting uninstall: gast
    Found existing installation: gast 0.2.2
    Uninstalling gast-0.2.2:
      Successfully uninstalled gast-0.2.2
Successfully installed gast-0.3.3


In [12]:
#from sklearn.feature_extraction.text import TfidTransformer
from sklearn.feature_extraction.text import TfidfTransformer

In [13]:
#  Tf is “n” (natural) by default, “l” (logarithmic) when sublinear_tf=True. 
#  Idf is “t” when use_idf is given, “n” (none) otherwise. 
#  Normalization is “c” (cosine)
#  when norm='l2', “n” (none) when norm=None.
tfidf = TfidfTransformer(use_idf=True,norm='l2',smooth_idf=True)

In [14]:
np.set_printoptions(precision=2)

In [15]:
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.3  0.51 0.   0.39 0.39 0.   0.3  0.   0.   0.51]
 [0.   0.43 0.   0.   0.   0.   0.56 0.43 0.   0.56 0.  ]
 [0.5  0.45 0.   0.5  0.19 0.19 0.19 0.3  0.25 0.19 0.  ]]


In [16]:
# The word 'is' has a frequency of 3 (tf=3) in the third document, and the document frequency of this term is also 3 since the term "is" occurs in all three documents (df=3)
#. Thus, we can calculate  the inverse document frequency as follows: idf("is",d3) = log(1+3)/(1+3) = 0, and the tf-idf (termFrequency-inverseDocumentFrequency) 
#  tf-idf("is",d3)=3 * (0+1) =3. Note this value is different than the above derived tf-idf(d3), as the above values in the final step is L2-normalized.
# lets clean our data by excluding unwanted characters!

In [17]:
#import pyprind
import tarfile
!pip install -q matplotlib-venn

In [18]:
#with tarfile.open('https://www.kaggle.com/aaron7sun/stocknews#Combined_News_DJIA.csv') as tar:
#  tar.extractall()
import numpy as np

In [19]:
np.random.seed(0)
!pip install pyprind
import pyprind
import pandas as pd
import os



# **Access Kaggle**


In [20]:
# Install Kaggle library
!pip install -q kaggle
!pip install kaggle



In [21]:
#!mkdir ~/.kaggle
#!cp Downloads/kaggle.json ~/.kaggle/kaggle.json

In [22]:
!kaggle datasets download -d aaron7sun/stocknews

stocknews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [23]:
!kaggle datasets download -d aaron7sun/stocknews

stocknews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [24]:
ls

Alemi2_final_project.ipynb
[31mCombined_News_DJIA.csv[m[m*
Combined_News_DJIA.xls
D11ScikitLearnBayes.pdf
Final_NLTK_Latent.ipynb
Linear_Visualization_Graph.ipynb
Naive_Visualization_Graph-Copy1.ipynb
Naive_Visualization_Graph.ipynb
[31mRedditNews.csv[m[m*
TensorFlow.ipynb
Tony Holdroyd - Tensorflow 2.0 Quick Start Guide (2019, Packt Publishing).pdf
Untitled.ipynb
alemi2_NLTK.ipynb
alemi_NLTK.ipynb
alemi_final_project-Copy1.ipynb
alemi_final_project.ipynb
alemi_final_project2.ipynb
kaggle.json
plot_logistic.ipynb
report.txt
[34mstocknews[m[m/
stocknews.zip
[31mupload_DJIA_table.csv[m[m*


In [25]:
#pd.read_csv("stocknews/RedditNews.csv")
#!unzip stocknews.zip -d ~/.kaggle

In [26]:
#df=pd.read_csv('stocknews.zip', compression='zip', header=0, sep=',', quotechar='"')
rNews=pd.read_csv('RedditNews.csv')
rNews.head()


Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...


In [27]:
dNews=pd.read_csv('Combined_News_DJIA.csv')
dNews.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...


In [28]:
# we need to clean yje "'" and \ and all punctuation marks like b" which are HTML makups
# to accomplish this, we will use Python's regular expression (regex) library re as shown below
import re


In [29]:
# We will remove all the non-word characters from the text via the regex [\w] + and convert the text into lower characters
def preprocessor(text):
    
    text = (re.sub('[\W]+',' ', text.lower() + ''))
    text = re.sub("b ",' ',text)
    return text

In [30]:
dNews['Top1'] = dNews['Top1'].apply(preprocessor)

In [31]:
dNews['Top1'].head()

0     georgia downs two russian warplanes as countr...
1     why wont america and nato help us if they won...
2     remember that adorable 9 year old who sang at...
3     u s refuses israel weapons to attack iran rep...
4     all the experts admit that we should legalise...
Name: Top1, dtype: object

In [32]:
dNews.loc[0,'Top1'][0:]

' georgia downs two russian warplanes as countries move to brink of war '

# **Tokenization**

In [33]:
def tokenizer(text):
  return text.split()


In [34]:
tokenizer(dNews['Top1'][0])

['georgia',
 'downs',
 'two',
 'russian',
 'warplanes',
 'as',
 'countries',
 'move',
 'to',
 'brink',
 'of',
 'war']

In [35]:
# install NTLK see www.ntlk.org/book/ in advanced applications in NLP
#  We will use Porter stemmer's algorim - 1979, which is word stemming
!pip install nltk



In [36]:
from nltk.stem.porter import PorterStemmer

In [37]:
porter = PorterStemmer()

In [38]:
def tokenizer_porter(text):
  return [porter.stem(word) for word in text.split()]

In [39]:
tokenizer_porter(dNews['Top1'][0])

['georgia',
 'down',
 'two',
 'russian',
 'warplan',
 'as',
 'countri',
 'move',
 'to',
 'brink',
 'of',
 'war']

In [40]:
# Stopwords are simply those words that are extremely common but bear no use for our analysis!
# like is, and, has, like,,,, we will use 127 stopwords already available in the NTLK library which we get by downloading it
import nltk


In [41]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/piruzalemi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
# Load and apply the stopwards after download
from nltk.corpus import stopwords

In [43]:
stop = stopwords.words('english')

In [44]:
[w for w in tokenizer_porter(dNews['Top1'][0]) if w not in stop]

['georgia', 'two', 'russian', 'warplan', 'countri', 'move', 'brink', 'war']

In [45]:
# Training a logistic regression model for Document Classification
# Here we train a logistic 

# **Training a Logistic Regression on Document Classification - Alemi 04/07/2020**



In [46]:
# we will use 
#dNews.head()

# Select the Test & Train Data

In [47]:
#. This should run on cleaned data, Note i have solely cleaned only row [0], we have to run the above code on all rows!!!!!!!
#. So this code is solely is for a placeholder, after the data is fully cleaned, then we run this part...Alemi 04/07/2020
X_train = dNews.loc[:1000,"Top1"].values
X_train.shape

(1001,)

In [51]:
X_train_tf =tfidf.fit_transform(vectorizer(X_train))
X_train_tf.shape

NameError: name 'vectorizer' is not defined

In [49]:
y_train= dNews.loc[:1000,"Label"].values
y_train.shape

(1001,)

In [388]:
#test = old_count_vectorizer.transform(test)
#tfidf_data = old_tfidf_transformer.transform(test)

In [389]:
X_test = dNews.loc[1001:,"Top1"].values
X_test.shape

(988,)

In [390]:
X_test_tf =tfidf.transform(vectorizer.transform(X_test))
X_test_tf.shape

(988, 5040)

In [391]:
y_test= dNews.loc[1001:,"Label"].values
y_test.shape

(988,)

In [392]:
from sklearn.model_selection import train_test_split
# X,y =  dNews.iloc[:, 2:].values, dNews.iloc[:, 1].values


In [393]:
# X_train, X_test, y_train, y_test =\
#    train_test_split(X,y,test_size=0.3,random_state=0,stratify=y)

# Imports from SkLearn

In [58]:
# like our class exercise we run the GridSearch to find the optimal set of parameters
from sklearn.model_selection import GridSearchCV

In [59]:
from sklearn.pipeline import Pipeline

In [60]:
from sklearn.linear_model import LogisticRegression

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)

# **Clean the Text Data**

In [63]:
dNews['Top1']

0        georgia downs two russian warplanes as countr...
1        why wont america and nato help us if they won...
2        remember that adorable 9 year old who sang at...
3        u s refuses israel weapons to attack iran rep...
4        all the experts admit that we should legalise...
                              ...                        
1984    barclays and rbs shares suspended from trading...
1985    2 500 scientists to australia if you want to s...
1986                     explosion at airport in istanbul
1987    jamaica proposes marijuana dispensers for tour...
1988    a 117 year old woman in mexico city finally re...
Name: Top1, Length: 1989, dtype: object

In [64]:
#import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/piruzalemi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
from nltk.corpus import stopwords

In [66]:
stop=stopwords.words('english') + [None]

In [67]:
tokenizer_porter(dNews['Top1'][0])

['georgia',
 'down',
 'two',
 'russian',
 'warplan',
 'as',
 'countri',
 'move',
 'to',
 'brink',
 'of',
 'war']

In [68]:
[w for w in tokenizer_porter(dNews['Top1'][0]) if w not in stop]

['georgia', 'two', 'russian', 'warplan', 'countri', 'move', 'brink', 'war']

# **Transforming words into feature vectors**

In [69]:
from sklearn.feature_extraction.text import CountVectorizer

In [70]:
cv = CountVectorizer(stop_words='english')

In [71]:
#count=CountVectorizer

In [72]:
#docs=np.array(dNews['Top1'])

In [73]:
vectorizer = TfidfVectorizer(stop_words='english')

### Bag of Words indices +  Vectorized + stop_words

In [74]:
# This is my bag of words based on the word count in the respective document, takes an array of texts and counts its words
bagX_test=vectorizer.fit_transform(X_test)
print(vectorizer.vocabulary_)


NameError: name 'X_test' is not defined

In [75]:
bagX_train=vectorizer.fit_transform(X_train)
print(vectorizer.vocabulary_)



In [76]:
bagX_test.shape, bagX_train.shape

NameError: name 'bagX_test' is not defined

In [77]:
# This is my bag of words based on the word count in the respective document, takes an array of texts and counts its words
bag=vectorizer.fit_transform(dNews['Top1'])
print(vectorizer.vocabulary_)
bag.shape




(1989, 7527)

In [78]:
bag.shape

(1989, 7527)

In [79]:
print(bag.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [80]:
bagArray=bag.toarray()

In [81]:
bagArray.shape

(1989, 7527)

In [82]:
# Create a length column to be used as a future feature 
#for i in range(0, len(dNews)):
#    dNews.length[i] = len(dNews.iloc[i])
#dNews.head()

In [83]:
from sklearn.feature_extraction.text import TfidfTransformer

In [84]:
tfidf=TfidfTransformer(use_idf=True,norm='l2',smooth_idf=True)

In [85]:
np.set_printoptions(precision=2)

### TfidTransformer class, takes the raw term frequencies from TfidVectorizer class as input and
###  transforms them into tf-idfs


In [86]:
print(tfidf.fit_transform(vectorizer.fit_transform(dNews['Top1'])).toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [87]:
dNews_tf =tfidf.fit_transform(vectorizer.fit_transform(dNews['Top1']))
dNews_tf.shape

(1989, 7527)

In [88]:
#X_train_tf =tfidf.fit_transform(vectorizer.fit_transform(X_train))
X_train_tf.shape

NameError: name 'X_train_tf' is not defined

In [89]:
#X_test_tf =tfidf.fit_transform(vectorizer.fit_transform(X_test))
X_test_tf.shape

NameError: name 'X_test_tf' is not defined

In [90]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

NameError: name 'X_test' is not defined

In [91]:
# Fit the classifier
from sklearn import linear_model

In [92]:
# The expit function, also known as the logistic sigmoid function, is defined as expit(x) = 1/(1+exp(-x)). 
# It is the inverse of the logit function.
from scipy.special import expit

In [93]:
#-------------------------------------------------------------------------------------
X=X_test_tf
y=y_test
#-------------------------------------------------------------------------------------
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(X, y)
clf.coef_
clf.score(X, y)

NameError: name 'X_test_tf' is not defined

In [94]:
#-------------------------------------------------------------------------------------
X=dNews_tf
#X=X_train_tf
y=dNews['Label']
#-------------------------------------------------------------------------------------
clf = linear_model.LogisticRegression(C=1e5)
clf.fit(X, y)
clf.coef_
#clf.score(X, y)



array([[ -2.92, -12.82,  -4.89, ...,  -3.64,  -4.94,   5.23]])

In [95]:
clf

LogisticRegression(C=100000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [96]:
import numpy as np
import matplotlib.pyplot as plt

# Naive Bayes Model: BernoulliNB

In [97]:
from sklearn.naive_bayes import BernoulliNB

In [98]:
from sklearn.feature_extraction.text import CountVectorizer

In [99]:
vectorizer = CountVectorizer()

In [100]:
#X = vectorizer.fit_transform(X_train)

In [101]:
naive_bayes = BernoulliNB()

In [102]:
# from sklearn.model_selection import train_test_split

In [103]:
# binarizefloat or None, optional (default=0.0)
# Threshold for binarizing (mapping to booleans) of sample features. 
# If None, input is presumed to already consist of binary vectors.
naive_bayes.fit(X_train_tf, y_train)
#X.shape

NameError: name 'X_train_tf' is not defined

In [104]:
#X_test2 = naive_bayes.transform(X_test)


In [105]:
#clf = BernoulliNB()
#clf.fit(X, y_train)

In [106]:
predictions = naive_bayes.predict(X_test_tf)
naive_bayes.score(X_test_tf, y_test)

NameError: name 'X_test_tf' is not defined

In [107]:
naive_bayes.score(X_train_tf, y_train)

NameError: name 'X_train_tf' is not defined

In [109]:
predictions

NameError: name 'predictions' is not defined

In [110]:
# and plot the result
plt.figure(1, figsize=(4, 3))
plt.clf()
plt.scatter(X.ravel(), y, color='black', zorder=20)
X_test = np.linspace(-5, 10, 300)

loss = expit(X_test * clf.coef_ + clf.intercept_).ravel()
plt.plot(X_test, loss, color='red', linewidth=3)

ols = linear_model.LinearRegression()
ols.fit(X, y)
plt.plot(X_test, ols.coef_ * X_test + ols.intercept_, linewidth=1)
plt.axhline(.5, color='.5')

plt.ylabel('y')
plt.xlabel('X')
plt.xticks(range(-5, 10))
plt.yticks([0, 0.5, 1])
plt.ylim(-.25, 1.25)
plt.xlim(-4, 10)
plt.legend(('Logistic Regression Model', 'Linear Regression Model'),
           loc="lower right", fontsize='small')
plt.tight_layout()
plt.show()

AttributeError: ravel not found

<Figure size 288x216 with 0 Axes>

In [111]:
[8]:  # Show label and resulting features
cleaned(['Label', 'Top1'])

SyntaxError: invalid syntax (<ipython-input-111-403df2f6ffc0>, line 1)

In [112]:
param_grid =[{'vect_ngram_range': [(1,1)],
              'vect_stop_words': [stop, None],
              'vect_topkenizer': [tokenizer,tokenizer_porter],
              'clf_penalty':['l1','l2'],
              'clf_C':[1.0,10.0,100.0]}]
              

In [113]:
lr_tfidf = Pipeline([('vect',tfidf),('clf',LogisticRegression(random_state=0))])

In [114]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=1)

In [115]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter clf_C for estimator Pipeline(memory=None,
         steps=[('vect',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=0, solver='warn',
                                    tol=0.0001, verbose=0, warm_start=False))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.