<a href="https://colab.research.google.com/github/olinyoder2534/NLP_practice/blob/main/n_gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
#bi-gram
v = CountVectorizer(ngram_range = (2,2))
#single and bi-gram
#v = CountVectorizer(ngram_range = (1,2))

In [4]:
text = "That movie was amazing."

In [5]:
v.fit([text])
v.vocabulary_

{'that movie': 1, 'movie was': 0, 'was amazing': 2}

In [6]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [7]:
corpus[0]

'Thor ate pizza'

In [8]:
import spacy

In [9]:
nlp = spacy.load("en_core_web_sm")

In [10]:
def preprocess(text):
  doc = nlp(text)

  filtered_tokens = []

  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    filtered_tokens.append(token.lemma_)

  return " ".join(filtered_tokens)

In [11]:
preprocess(corpus[0])

'thor eat pizza'

In [12]:
corpus_processed = [preprocess(i) for i in corpus]

In [13]:
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [14]:
v = CountVectorizer(ngram_range = (1,2))

In [15]:
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [16]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

In [17]:
df = pd.read_json("/content/news_dataset.json")

In [18]:
df.shape

(12695, 2)

In [19]:
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [20]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [21]:
from imblearn.under_sampling import RandomUnderSampler

In [22]:
rus = RandomUnderSampler(random_state=42)

In [23]:
df_balanced, y_resampled = rus.fit_resample(df[['text', 'category']], df['category'])

df_balanced = pd.DataFrame(df_balanced, columns=['text', 'category'])
df_balanced.head()

Unnamed: 0,text,category
0,How to Develop the Next Generation of Innovato...,BUSINESS
1,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS
2,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS
3,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS
4,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS


In [24]:
df_balanced.category.value_counts()

category
BUSINESS    1381
CRIME       1381
SCIENCE     1381
SPORTS      1381
Name: count, dtype: int64

In [28]:
target = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}

df_balanced['category_num'] = df_balanced['category'].map({
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3
})

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced['text'],df_balanced['category_num'], stratify=df_balanced['category_num'])

In [30]:
X_train.head()

2694    Convicted Killer Guillermo Aillon Booted From ...
1272    The New Black Friday Means Lines But Less Fren...
3179    Warm Winter Weather Making Some Animals Vulner...
1402    Wrong-Way Driver Crashes Into Tanker, Igniting...
3402    Are Nasty Comments Like These Keeping Women Ou...
Name: text, dtype: object

In [31]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4143,), (1381,), (4143,), (1381,))

In [40]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

Just using a simple BoW model

In [34]:
clfNB = Pipeline([
  ('vectorizer_bow', CountVectorizer()),
  ('multinomial NB', MultinomialNB())
])

In [35]:
clfNB.fit(X_train, y_train)

In [37]:
y_predNB = clfNB.predict(X_test)

In [39]:
print(classification_report(y_test, y_predNB))

              precision    recall  f1-score   support

           0       0.77      0.87      0.82       345
           1       0.92      0.84      0.88       345
           2       0.84      0.91      0.88       346
           3       0.90      0.79      0.84       345

    accuracy                           0.85      1381
   macro avg       0.86      0.85      0.85      1381
weighted avg       0.86      0.85      0.85      1381



In [41]:
clfRF = Pipeline([
  ('vectorizer_bow', CountVectorizer()),
  ('RF', RandomForestClassifier())
])

In [42]:
clfRF.fit(X_train, y_train)

In [43]:
y_predRF = clfRF.predict(X_test)

In [44]:
print(classification_report(y_test, y_predRF))

              precision    recall  f1-score   support

           0       0.71      0.72      0.72       345
           1       0.80      0.72      0.76       345
           2       0.81      0.87      0.84       346
           3       0.76      0.76      0.76       345

    accuracy                           0.77      1381
   macro avg       0.77      0.77      0.77      1381
weighted avg       0.77      0.77      0.77      1381



Using n-gram(1,2)

In [45]:
clfNB1 = Pipeline([
  ('vectorizer', CountVectorizer(ngram_range = (1,2))),
  ('multinomial NB', MultinomialNB())
])

clfNB1.fit(X_train, y_train)

y_predNB1 = clfNB1.predict(X_test)

print(classification_report(y_test, y_predNB1))

              precision    recall  f1-score   support

           0       0.69      0.90      0.78       345
           1       0.95      0.78      0.86       345
           2       0.85      0.88      0.86       346
           3       0.90      0.76      0.82       345

    accuracy                           0.83      1381
   macro avg       0.85      0.83      0.83      1381
weighted avg       0.85      0.83      0.83      1381



In [48]:
df_balanced['textNew'] = df_balanced.text.apply(preprocess)

In [49]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(df_balanced['textNew'],df_balanced['category_num'], stratify=df_balanced['category_num'])

In [50]:
clfNB2 = Pipeline([
  ('vectorizer', CountVectorizer(ngram_range = (1,2))),
  ('multinomial NB', MultinomialNB())
])

clfNB2.fit(X_train1, y_train1)

y_predNB2 = clfNB2.predict(X_test1)

print(classification_report(y_test1, y_predNB2))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       346
           1       0.90      0.85      0.88       345
           2       0.86      0.92      0.89       345
           3       0.90      0.85      0.87       345

    accuracy                           0.87      1381
   macro avg       0.87      0.87      0.87      1381
weighted avg       0.87      0.87      0.87      1381



In [51]:
data = pd.read_csv('/content/Fake_Real_Data.csv')

In [52]:
data.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [55]:
data.label.value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [56]:
data['textNew'] = data.Text.apply(preprocess)

In [62]:
target = {'Real': 0, 'Fake': 1}

data['label_num'] = data['label'].map({
    'Real': 0,
    'Fake': 1
})

In [63]:
data.head()

Unnamed: 0,Text,label,textNew,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,Trump Surrogate BRUTALLY Stabs Pathetic vide...,1
1,U.S. conservative leader optimistic of common ...,Real,U.S. conservative leader optimistic common gro...,0
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,trump propose U.S. tax overhaul stir concern d...,0
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,Court Forces Ohio allow million illegally pu...,1
4,Democrats say Trump agrees to work on immigrat...,Real,Democrats Trump agree work immigration bill wa...,0


In [64]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(data['textNew'], data['label_num'])

In [65]:
X_train2.head()

7474    Orange County Texas start curfew Saturday Harv...
7502      Trump Personal Lawyer Major Legal prepare Ru...
343     development aid defense spending NATO Stoltenb...
2769    know U.S. probe russian meddling 2016 election...
9619      Congressional Black Caucus Formulates Battle...
Name: textNew, dtype: object

In [66]:
clfNB3 = Pipeline([
  ('vectorizer', CountVectorizer(ngram_range = (1,3))),
  ('multinomial NB', MultinomialNB())
])

clfNB3.fit(X_train2, y_train2)

y_predNB3 = clfNB3.predict(X_test2)

print(classification_report(y_test2, y_predNB3))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1282
           1       0.99      0.98      0.99      1193

    accuracy                           0.99      2475
   macro avg       0.99      0.99      0.99      2475
weighted avg       0.99      0.99      0.99      2475



In [67]:
from sklearn.linear_model import LogisticRegression

clfLG = Pipeline([
  ('vectorizer', CountVectorizer(ngram_range = (1,3))),
  ('logistic regression', LogisticRegression())
])

clfLG.fit(X_train2, y_train2)

y_predLG = clfLG.predict(X_test2)

print(classification_report(y_test2, y_predLG))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1282
           1       1.00      1.00      1.00      1193

    accuracy                           1.00      2475
   macro avg       1.00      1.00      1.00      2475
weighted avg       1.00      1.00      1.00      2475

