In [1]:
# Read the whole article into memory as one string. The encoding is passed
# explicitly so decoding does not depend on the platform's locale default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('./Wikipedia_articles/wiki_text_debugging.txt', encoding='utf-8') as file:
    data = file.read()

In [2]:
# Display the raw article text that was read from the file (a single string).
data

'\'\'\'Debugging\'\'\' is the process of finding and resolving of defects that prevent correct operation of computer software or a system.  \n\nNumerous books have been written about debugging (see below: #Further reading|Further reading), as it involves numerous aspects, including interactive debugging, control flow, integration testing, Logfile|log files, monitoring (Application monitoring|application, System Monitoring|system), memory dumps, Profiling (computer programming)|profiling, Statistical Process Control, and special design tactics to improve detection while simplifying changes.\n\nOrigin\nA computer log entry from the Mark&nbsp;II, with a moth taped to the page\n\nThe terms "bug" and "debugging" are popularly attributed to Admiral Grace Hopper in the 1940s.[http://foldoc.org/Grace+Hopper Grace Hopper]  from FOLDOC While she was working on a Harvard Mark II|Mark II Computer at Harvard University, her associates discovered a moth stuck in a relay and thereby impeding operatio

In [3]:
from collections import Counter

In [4]:
from nltk.tokenize import word_tokenize

In [5]:
# Split the raw article into tokens, then show the ten most frequent tokens.
data1 = word_tokenize(data)
Counter(data1).most_common(n=10)

[(',', 151),
 ('the', 134),
 ('.', 89),
 ('of', 81),
 ("''", 66),
 ('to', 62),
 ('a', 55),
 ('``', 47),
 ('and', 41),
 ('(', 40)]

## "Processing Data"

In [6]:
# a. Making everything lower case
data2 = list(map(str.lower, data1))
Counter(data2).most_common(10)

[(',', 151),
 ('the', 150),
 ('.', 89),
 ('of', 81),
 ("''", 66),
 ('to', 63),
 ('a', 60),
 ('``', 47),
 ('in', 44),
 ('and', 41)]

In [7]:
# b. Focussing only on alphabetical text
# Keep only tokens made up entirely of alphabetic characters.
data3 = list(filter(str.isalpha, data2))
Counter(data3).most_common(10)

[('the', 150),
 ('of', 81),
 ('to', 63),
 ('a', 60),
 ('in', 44),
 ('and', 41),
 ('debugging', 40),
 ('for', 26),
 ('is', 25),
 ('or', 25)]

In [8]:
# c. Focussing only on relevant words
from nltk.corpus import stopwords

# Build the stop-word collection once, as a set. Previously
# stopwords.words('english') was called again for every token and the
# returned list was searched linearly each time.
english_stopwords = set(stopwords.words('english'))
data4 = [x for x in data3 if x not in english_stopwords]
Counter(data4).most_common(10)

[('debugging', 40),
 ('system', 19),
 ('software', 16),
 ('tools', 14),
 ('process', 12),
 ('computer', 12),
 ('used', 12),
 ('bug', 11),
 ('http', 11),
 ('term', 11)]

In [9]:
# d. Combining roots
from nltk.stem import WordNetLemmatizer

# One lemmatizer instance is reused for all tokens instead of constructing a
# new WordNetLemmatizer for every single token.
lemmatizer = WordNetLemmatizer()
data5 = [lemmatizer.lemmatize(x) for x in data4]
Counter(data5).most_common(10)

[('debugging', 40),
 ('system', 25),
 ('bug', 17),
 ('software', 16),
 ('problem', 15),
 ('tool', 15),
 ('computer', 14),
 ('process', 13),
 ('term', 13),
 ('debugger', 13)]

In [10]:
from nltk.stem import LancasterStemmer

# Alternative to lemmatizing: stem every token with the Lancaster stemmer.
# The stemmer is created once and reused rather than rebuilt per token.
# NOTE: this rebinds `data5`, replacing the result of the previous cell.
lancaster_stemmer = LancasterStemmer()
data5 = [lancaster_stemmer.stem(x) for x in data4]
Counter(data5).most_common(10)

[('debug', 58),
 ('us', 31),
 ('program', 27),
 ('system', 26),
 ('comput', 19),
 ('process', 17),
 ('bug', 17),
 ('softw', 16),
 ('problem', 15),
 ('tool', 15)]

In [11]:
from nltk.stem import PorterStemmer

# Same comparison with the Porter stemmer. The stemmer is created once and
# reused rather than rebuilt per token.
# NOTE: this rebinds `data5`, replacing the result of the previous cell.
porter_stemmer = PorterStemmer()
data5 = [porter_stemmer.stem(x) for x in data4]
Counter(data5).most_common(10)

[('debug', 45),
 ('system', 25),
 ('use', 23),
 ('program', 20),
 ('comput', 19),
 ('bug', 17),
 ('softwar', 16),
 ('problem', 15),
 ('tool', 15),
 ('process', 13)]

---
---
---

### "Named Entity recognition"
"NER-People, places, organizations, Dates, states, works of art and other categories. It can be used to answer who, what, when kind of queries"

In [12]:
from nltk import pos_tag

In [13]:
# Tokenize a sample sentence and tag every token with its part of speech.
sentence = (
    "In New Delhi, I like to ride the Metro to visit 'India Gate' "
    "and some restaurants rated well by Vir Sanghvi"
)
a = word_tokenize(sentence)
b = pos_tag(a)
print(b)

[('In', 'IN'), ('New', 'NNP'), ('Delhi', 'NNP'), (',', ','), ('I', 'PRP'), ('like', 'VBP'), ('to', 'TO'), ('ride', 'VB'), ('the', 'DT'), ('Metro', 'NNP'), ('to', 'TO'), ('visit', 'VB'), ("'India", 'NNP'), ('Gate', 'NNP'), ("'", 'POS'), ('and', 'CC'), ('some', 'DT'), ('restaurants', 'NNS'), ('rated', 'VBN'), ('well', 'RB'), ('by', 'IN'), ('Vir', 'NNP'), ('Sanghvi', 'NNP')]


In [14]:
from nltk import ne_chunk

In [15]:
# Run named-entity chunking over the POS-tagged tokens and print the result.
c = ne_chunk(b)
print(c)

(S
  In/IN
  (GPE New/NNP Delhi/NNP)
  ,/,
  I/PRP
  like/VBP
  to/TO
  ride/VB
  the/DT
  (ORGANIZATION Metro/NNP)
  to/TO
  visit/VB
  'India/NNP
  Gate/NNP
  '/POS
  and/CC
  some/DT
  restaurants/NNS
  rated/VBN
  well/RB
  by/IN
  (PERSON Vir/NNP Sanghvi/NNP))


### spaCy, gensim, polyglot

---

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:
from nltk.tokenize import PunktSentenceTokenizer, TweetTokenizer, MWETokenizer, SpaceTokenizer
from nltk import word_tokenize

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [18]:
# Load the fake/real news dataset.
# NOTE(review): this is a hardcoded absolute local path; adjust it to wherever
# the CSV lives on the machine running the notebook.
news_csv_path = 'd://PyD/Datasets/fake_or_real_news.csv'
df = pd.read_csv(news_csv_path)

In [19]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [21]:
# First 100 characters of the article text stored under index label 1.
df["text"][1][:100]

'Google Pinterest Digg Linkedin Reddit Stumbleupon Print Delicious Pocket Tumblr \nThere are two funda'

In [22]:
# Target vector: the `label` column of the dataset.
y = df["label"]

In [None]:
# (An unfinished `from` statement used to be here. On its own it is a
# SyntaxError and would abort a Restart & Run All, so it has been removed.)

In [25]:
# Hold out 33% of the articles for testing; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    y,
    test_size=0.33,
    random_state=53,
)

In [26]:
# Preview the first few training texts.
X_train.head()

2576                                                     
1539    Report Copyright Violation Do you think there ...
5163    The election in 232 photos, 43 numbers and 131...
2615    Email Ever wonder what’s on the mind of today’...
4270    Wells Fargo is Rotting from the Top Down Wells...
Name: text, dtype: object

In [27]:
# Count vectorizer configured only with English stop-word removal.
# Note: `count_vectorizer` is re-bound by a later cell (with a custom
# tokenizer and a feature cap) before anything is fitted with it.
count_vectorizer = CountVectorizer(stop_words='english')

In [57]:
def token(text):
    """Tokenize ``text`` for use as a vectorizer tokenizer callback.

    The text is first split with NLTK's ``SpaceTokenizer``; the resulting
    tokens are then passed through an ``MWETokenizer`` that is created with
    no multi-word expressions registered.

    Returns the token sequence produced by ``MWETokenizer.tokenize``.
    """
    space_tokens = SpaceTokenizer().tokenize(text)
    return MWETokenizer().tokenize(space_tokens)

In [58]:
# Word-level unigram counts using the custom `token` tokenizer, with English
# stop words removed and the vocabulary capped at 1000 features.
count_vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    tokenizer=token,
    ngram_range=(1, 1),
    max_features=1000,
)

In [59]:
%%time
# Fit the count vectorizer on the training texts and transform them in one
# step; the cell is timed because this is the slow part.
count_train = count_vectorizer.fit_transform(X_train)

Wall time: 15.6 s


In [60]:
# Transform the test texts with the vectorizer already fitted on the training
# texts (transform only, no refit).
count_test = count_vectorizer.transform(X_test)

In [61]:
# Show the first 50 vocabulary entries of the fitted count vectorizer.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides so the
# cell runs on both old and new releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    count_feature_names = list(count_vectorizer.get_feature_names_out())
else:
    count_feature_names = count_vectorizer.get_feature_names()
print(count_feature_names[:50])

['!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '...', '/', '0', '1', '10', '100', '11', '12', '14', '15', '16', '2', '20', '2008', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '25', '26', '28', '3', '30', '4', '40', '5', '50', '6', '7', '8', ':', ';', '?', '[', ']']


In [47]:
# TF-IDF features: English stop words removed, terms appearing in more than
# 70% of documents ignored, vocabulary capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
    max_features=1000,
)

In [48]:
# Fit the TF-IDF vectorizer on the training texts and transform them.
tfidf_train =tfidf_vectorizer.fit_transform(X_train)

In [49]:
# Transform the test texts with the already-fitted TF-IDF vectorizer.
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# np.round(sorted(tfidf_vectorizer.idf_)[:-6:-1], 3)

In [50]:
# Dimensions of the TF-IDF training matrix.
tfidf_train.shape

(4244, 1000)

In [51]:
# Dimensions of the count training matrix.
count_train.shape

(4244, 1000)

In [62]:
def _feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` as a list.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; this uses whichever the installed version
    offers so the cell works on both old and new releases.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# Densify with .toarray(): it is the long-standing equivalent of the `.A`
# shorthand, which has been removed from sparse matrices in recent SciPy.
count_df = pd.DataFrame(count_train.toarray(), columns=_feature_names(count_vectorizer))
# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns=_feature_names(tfidf_vectorizer))

In [63]:
# Preview the first rows of the dense count-feature frame.
count_df.head()

Unnamed: 0,!,"""",$,%,&,',(,),*,",",...,york,young,,–,—,‘,’,“,”,…
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,5,...,0,0,0,0,3,2,10,3,3,0
4,0,0,0,0,0,0,1,1,0,17,...,0,0,0,0,4,0,12,0,0,0


In [54]:
# Preview the first rows of the dense TF-IDF feature frame.
tfidf_df.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,worse,worth,wouldn,wrong,wrote,year,years,yes,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.052281,0.0,0.0,0.0


In [64]:
# Train a multinomial Naive Bayes classifier on the count features, then
# report accuracy and the confusion matrix on the held-out test set.
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

pred = nb_classifier.predict(count_test)
score = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)

print(score)
print(cm)

0.8598756575801052
[[896 112]
 [181 902]]


In [65]:
# Refit the same Naive Bayes classifier on the TF-IDF features and report the
# same two metrics for comparison with the count-based run.
nb_classifier.fit(tfidf_train, y_train)
pred = nb_classifier.predict(tfidf_test)

score = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)

print(score)
print(cm)

0.8479196556671449
[[893 115]
 [203 880]]


In [67]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so the fitted tree, and therefore the reported accuracy and
# confusion matrix, is the same on every run. Without it the scores can differ
# between runs. The seed matches the one used for the train/test split.
dt = DecisionTreeClassifier(random_state=53)

In [68]:
# Train the decision tree on the count features and evaluate it on the test
# set with the same metrics used for Naive Bayes.
dt.fit(count_train, y_train)
pred = dt.predict(count_test)

score = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)

print(score)
print(cm)

0.8273553323768532
[[856 152]
 [209 874]]


In [69]:
# Recompute the confusion matrix from the current `pred` (same call as in the
# previous cell) so `cm` is available for the labelled table built next.
cm = confusion_matrix(y_test, pred)
print(cm)

[[856 152]
 [209 874]]


In [70]:
# Wrap the confusion matrix in a DataFrame with readable labels:
# rows are the actual classes, columns the predicted classes.
dfcm = pd.DataFrame(
    cm,
    index=['Actually_Fake', 'Actually_Real'],
    columns=['Pred_Fake', 'Pred_Real'],
)

In [71]:
# Display the labelled confusion matrix.
dfcm

Unnamed: 0,Pred_Fake,Pred_Real
Actually_Fake,856,152
Actually_Real,209,874
