# Model Training for Score Relevance

In [1]:
import pandas as pd
import numpy as np

## Training a simple model 

In [2]:
import seaborn as sns
from nltk.stem import *
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import re

In [3]:
final_df = pd.read_csv('all_score_training.csv')

In [4]:
final_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,headlines,target
0,0,0,0.0,‘Obviously I don’t think women are any less in...,1.0
1,1,1,1.0,Layoffs hit crypto and real estate tech partic...,1.0
2,2,2,2.0,A brief recap of one of the worst weeks in cry...,1.0
3,3,3,3.0,Wells Fargo economist likens crypto collapse t...,1.0
4,4,4,4.0,Crypto's Excruciating Week Has Traders Bracing...,1.0


In [5]:
c = final_df['target'].value_counts()
p = final_df['target'].value_counts(normalize=True)
pd.concat([c,p], axis=1, keys=['counts', '%'])
# final_df['target'].value_counts()

Unnamed: 0,counts,%
0.0,1187,0.684939
1.0,546,0.315061


In [6]:
final_df.drop(['Unnamed: 0'],axis=1,inplace=True)
final_df.drop(['Unnamed: 0.1'],axis=1,inplace=True)
final_df.drop(['Unnamed: 0.1.1'],axis=1,inplace=True)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(final_df['headlines'],final_df['target'], stratify=final_df['target'])
X_train.head(4)

1635    Paramount+, Starz, Showtime & More Are on Sale...
1583    Couple Slain in Parade Nightmare in Front of T...
979     Coinbase Stock Tumbles, Robinhood Slumps As Bi...
1720    4 bears killed in Alaska campground reserved f...
Name: headlines, dtype: object

In [8]:
# Building a Naive Bayes Classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [9]:
model.fit(X_train,y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [10]:
y_pred = model.predict(X_test)
score_pred = model.predict_proba(X_test)[:,1]

In [11]:
accuracy_score(y_test, y_pred)

0.8594470046082949

Let's apply some techniques to enhance the result:
- all lowercase
- removal of points
- removal of stopwords
- lemmatization

#### Testing the first 3 techniques 

In [12]:
final_df['headlines'] = final_df['headlines'].map(lambda x: x.lower())

In [13]:
final_df['headlines'] = final_df['headlines'].map(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))

In [14]:
nltk.download('stopwords')
english_stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

In [16]:
final_df['titles'] = remove_stop_words(final_df['headlines'])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(final_df['titles'],final_df['target'], stratify=final_df['target'])
X_train.head(4)

907     independent al gross says ending alaska house bid
1162    distracted putin tumble new bloodbath official...
1059          gas prices coming without biden tax holiday
821     denmark keeps f 16 fighter jets flying due rus...
Name: titles, dtype: object

In [18]:
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
score_pred = model.predict_proba(X_test)[:,1]

In [19]:
accuracy_score(y_test, y_pred)

0.9101382488479263

In [20]:
confusion_matrix(y_test, y_pred)

array([[293,   4],
       [ 35, 102]], dtype=int64)

### Applying Lemmatization 

In [21]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
#We need to use the final_df['titles'] to  lemmatize each word:
lemmatizer = WordNetLemmatizer()

In [23]:
#Function to apply for each word the proper lemmatization.
def lemmetize_titles(words):
    a = []
    tokens = word_tokenize(words)
    for token in tokens:
        lemmetized_word = lemmatizer.lemmatize(token)
        a.append(lemmetized_word)
    lemmatized_title = ' '.join(a)
    return lemmatized_title 

In [24]:
final_df['lemmetized_titles'] = final_df['titles'].apply(lemmetize_titles)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(final_df['lemmetized_titles'],final_df['target'], stratify=final_df['target'])
X_train.head(4)

694     3 tuck mba graduate made successful career pivot
733    stock still expensive rising rate may shock fi...
367    despite plea leniency expert say ghislaine max...
695            reit performing compared rest market 2022
Name: lemmetized_titles, dtype: object

In [26]:
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
score_pred = model.predict_proba(X_test)[:,1]

### Train Set Scores 

In [27]:
y_train_pred = model.predict(X_train)

In [28]:
accuracy_score(y_train, y_train_pred)

0.976905311778291

In [29]:
precision_score(y_train, y_train_pred)

0.9896640826873385

In [30]:
recall_score(y_train, y_train_pred)

0.9364303178484108

In [31]:
f1_score(y_train, y_train_pred)

0.9623115577889447

In [32]:
roc_auc_score(y_train, y_train_pred)

0.9659679679129695

### Test Set Scores 

In [33]:
accuracy_score(y_test, y_pred)

0.9193548387096774

In [34]:
precision_score(y_test, y_pred)

0.9553571428571429

In [35]:
recall_score(y_test, y_pred)

0.781021897810219

In [36]:
f1_score(y_test, y_pred)

0.8594377510040162

In [37]:
roc_auc_score(y_test, y_pred)

0.8820934404876011

In [38]:
confusion_matrix(y_test, y_pred)

array([[292,   5],
       [ 30, 107]], dtype=int64)

In [None]:
#Let's store the model for later use:
import pickle
pickle.dump(model, open('SimpleRelScoreModel.sav', 'wb'))

### Check Manoel File 

In [39]:
df_study = pd.read_csv('../OneYearNewsDataset.csv')

In [40]:
df_study.head(5)

Unnamed: 0.1,Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry
0,0,https://www.digitaljournal.com/pr/longhash-ven...,,LongHash Ventures and Terraform Labs Join Forc...,20220406T163000Z,,digitaljournal.com,English,United States
1,1,https://www.prnewswire.com/news-releases/terra...,,TERRA . DO TO COMPETE IN FINAL 20 GROUP FOR ED...,20220406T001500Z,,prnewswire.com,English,United States
2,2,https://www.fool.com/investing/2022/04/06/can-...,,Can THORchain Keep Surging ? | The Motley Fool,20220406T120000Z,,fool.com,English,United States
3,3,https://www.finanznachrichten.de/nachrichten-2...,,Gold Terra Resource Corp : Gold Terra Intersec...,20220406T123000Z,,finanznachrichten.de,English,Germany
4,4,https://economictimes.indiatimes.com/tech/tech...,,Crypto wallet Leap raises $3 . 2 million throu...,20220406T114500Z,,economictimes.indiatimes.com,English,India


In [41]:
df_study.drop(['Unnamed: 0'],axis=1,inplace=True)

In [42]:
df_study['lemmetized_titles'] = df_study['title'].map(lambda x: x.lower())
df_study['lemmetized_titles'] = df_study['lemmetized_titles'].map(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))
df_study['lemmetized_titles'] = remove_stop_words(df_study['lemmetized_titles'])
df_study['lemmetized_titles'] = df_study['lemmetized_titles'].apply(lemmetize_titles)

In [43]:
proba = model.predict_proba(df_study['lemmetized_titles'])[:,1]

In [44]:
df_study['relevance_probability']=proba

In [45]:
Strength_ = model.predict(df_study['lemmetized_titles'])

In [46]:
df_study['relevance_class']=Strength_

In [47]:
df_study

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,lemmetized_titles,relevance_probability,relevance_class
0,https://www.digitaljournal.com/pr/longhash-ven...,,LongHash Ventures and Terraform Labs Join Forc...,20220406T163000Z,,digitaljournal.com,English,United States,longhash venture terraform lab join force adva...,0.608537,1.0
1,https://www.prnewswire.com/news-releases/terra...,,TERRA . DO TO COMPETE IN FINAL 20 GROUP FOR ED...,20220406T001500Z,,prnewswire.com,English,United States,terra compete final 20 group edtech competitio...,0.289958,0.0
2,https://www.fool.com/investing/2022/04/06/can-...,,Can THORchain Keep Surging ? | The Motley Fool,20220406T120000Z,,fool.com,English,United States,thorchain keep surging motley fool,0.288098,0.0
3,https://www.finanznachrichten.de/nachrichten-2...,,Gold Terra Resource Corp : Gold Terra Intersec...,20220406T123000Z,,finanznachrichten.de,English,Germany,gold terra resource corp gold terra intersects...,0.201722,0.0
4,https://economictimes.indiatimes.com/tech/tech...,,Crypto wallet Leap raises $3 . 2 million throu...,20220406T114500Z,,economictimes.indiatimes.com,English,India,crypto wallet leap raise 3 2 million token sale,0.719981,1.0
...,...,...,...,...,...,...,...,...,...,...,...
243499,https://www.crowdfundinsider.com/2021/04/17474...,https://www.crowdfundinsider.com/2021/04/17474...,"Digital Asset Firm Paxos Raises $300 Million ,...",20210429T151500Z,https://www.crowdfundinsider.com/wp-content/up...,crowdfundinsider.com,English,United Kingdom,digital asset firm paxos raise 300 million val...,0.497565,0.0
243500,https://www.jdsupra.com/legalnews/occ-conditio...,,OCC Conditionally Approves Trust Company Chart...,20210429T204500Z,https://jdsupra-static.s3.amazonaws.com/profil...,jdsupra.com,English,United States,occ conditionally approves trust company chart...,0.520409,1.0
243501,https://www.bangkokpost.com/tech/2107743/think...,,Think twice before going with Huawei,20210429T054500Z,https://static.bangkokpost.com/media/content/2...,bangkokpost.com,English,Thailand,think twice going huawei,0.315166,0.0
243502,http://www.tynmagazine.com/cryptoeconomy-china...,,Cryptoeconomy : China downgrades bitcoin into ...,20210429T153000Z,http://tynmedia.com/tynmag/wp-content/uploads/...,tynmagazine.com,English,United States,cryptoeconomy china downgrade bitcoin investme...,0.333374,0.0


In [48]:
df_study.to_csv('../OneYearNewsDataset_AfterRelevance.csv')