# Setting up

In [None]:
# Delete the `sample_data` directory from the current working directory
# (presumably Colab's bundled demo datasets — confirm). -f keeps the command
# from failing when the directory is already gone.
!rm -rf sample_data

In [None]:
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True asks for a fresh
# mount even if one is already present.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Copy the project from the mounted Drive into /content/newz-aware.
# Absolute paths keep this cell independent of the current directory (a later
# cell changes it), and copying the directory *contents* ("/.") into an
# existing target prevents a re-run from nesting newz-aware/newz-aware.
!mkdir -p /content/newz-aware && cp -r /content/drive/MyDrive/newz-aware/. /content/newz-aware/

In [None]:
# Work from the project root. The absolute path makes this cell safe to re-run:
# a relative `%cd newz-aware/` fails once the kernel is already inside it.
%cd /content/newz-aware

/content/newz-aware


In [None]:
# Install the project's pinned dependencies. %pip (rather than !pip) installs
# into the environment of the running kernel; -q keeps the install log short.
%pip install -q -r requirements.txt

Collecting en-core-web-sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (from -r requirements.txt (line 1))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib==1.1.0 (from -r requirements.txt (line 2))
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.0/307.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nltk==3.7 (from -r requirements.txt (line 3))
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy==3.2.2 (from -r requirements.txt (line 4))
  Downloadi

In [None]:
import nltk
# Fetch the NLTK stop-word corpus; PreProcessor reads it through
# nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Download the small English spaCy pipeline that PreProcessor loads with
# spacy.load('en_core_web_sm').
# NOTE(review): the pip log above suggests requirements.txt already pins
# en-core-web-sm 3.2.0, so this step may be redundant — confirm.
!python -m spacy download en_core_web_sm

2024-03-10 16:50:18.854132: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-03-10 16:50:18.854184: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl#egg=en_core_web_sm==3.2.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting en-core-web-sm==3.2.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2

# Preprocessing Data

In [None]:
# Load the scraped dataset (path is relative to the project root) and pull out
# the headline text and its bias label as separate Series.
df = pd.read_csv('TrainingData/scrapedDataset.csv')
X, y = df['title'], df['bias']

In [None]:
import spacy
from nltk.corpus import stopwords
import re

class PreProcessor:
    """Turn a raw text document into a space-joined string of lemmas.

    The pipeline (see ``forward``) keeps only runs of ``[a-zA-Z0-9]``,
    lower-cases them, lemmatizes with spaCy's ``en_core_web_sm`` pipeline and
    finally drops NLTK stop words.
    """

    def __init__(self, lang = 'english'):
        """Load the spaCy pipeline and the NLTK stop-word set.

        ``lang`` only selects the stop-word list; the spaCy model is always
        ``en_core_web_sm``.
        """
        self.nlp = spacy.load('en_core_web_sm')
        self.stop_words = set(stopwords.words(lang))

    def remove_special_chars(self, doc):
        """Keep only alphanumeric runs of ``doc``, joined by single spaces and lower-cased."""
        alnum_runs = re.findall(r'[a-zA-Z0-9]+', doc)
        return " ".join(alnum_runs).lower()

    def lemmatize(self, doc):
        """Run ``doc`` through the spaCy pipeline and return the lemma of each token as a list."""
        return [token.lemma_ for token in self.nlp(doc)]

    def remove_stop_words(self, tokenized_doc):
        """Return the tokens that are not in ``self.stop_words``, in their original order."""
        return [word for word in tokenized_doc if word not in self.stop_words]

    def forward(self, doc):
        """Apply clean -> lemmatize -> stop-word removal and return the result as one string."""
        cleaned = self.remove_special_chars(doc)
        lemmas = self.lemmatize(cleaned)
        return " ".join(self.remove_stop_words(lemmas))

preproc = PreProcessor()

# Build a new Series instead of assigning into X element by element: X is a
# column taken from df, so item assignment triggers SettingWithCopyWarning, and
# X[i] with a positional counter is only correct while the index is 0..n-1.
# str() turns a missing title (NaN) into the literal 'nan', which a later cell
# filters out.
X = X.map(lambda title: preproc.forward(str(title)))
X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[i] = preproc.forward(str(X[i]))


0       zimmerman verdict renew focus stand ground law
1    senate immigration bill pass judiciary committ...
2    let doma fool supreme court restrict right was...
3    obamacare trouble exchange provision delay law...
4    exclusive immigration agent rip house lawmaker...
Name: title, dtype: object

In [None]:
# Collapse the nine raw bias scores (-4..4) into three classes:
#   -4..-2 -> 0,   -1..1 -> 1,   2..4 -> 2
# The result is bound to a new Series derived from df['bias'] rather than
# replaced in place, so the raw column is not mutated as a side effect and
# re-running this cell on its own does not remap already-collapsed labels
# (0 -> 1, 1 -> 2). Once df itself is rebuilt from X and y further down,
# re-running this cell would remap again.
y = df['bias'].replace(to_replace=list(range(-4, 5)), value=[0, 0, 0, 1, 1, 1, 2, 2, 2])
y.value_counts()

1    17510
2     1878
0     1616
Name: bias, dtype: int64

In [None]:
# Put the cleaned titles and the collapsed labels side by side in one frame.
df = pd.concat([X, y], axis='columns')
df

Unnamed: 0,title,bias
0,zimmerman verdict renew focus stand ground law,1
1,senate immigration bill pass judiciary committ...,1
2,let doma fool supreme court restrict right was...,1
3,obamacare trouble exchange provision delay law...,1
4,exclusive immigration agent rip house lawmaker...,1
...,...,...
20999,dana milbank senator turn table cayman investo...,2
21000,republicans hatred obama blind public disinter...,2
21001,dana milbank stockman step forward republican ...,1
21002,e j dionne republican problem solver washingto...,2


In [None]:
# Discard rows whose cleaned title is the literal string 'nan' (what str()
# produces for a missing title). A headline whose cleaned text is exactly
# "nan" is dropped as well. A boolean filter yields a new frame instead of
# mutating df in place through an intermediate index lookup.
df = df[df['title'] != 'nan'].copy()

In [None]:
# One sub-frame per collapsed bias label (0, 1, 2) so each class can be resampled on its own.
df_class0, df_class1, df_class2 = (df[df['bias'] == label] for label in (0, 1, 2))

In [None]:
# Upsample classes 0 and 2 (with replacement) to the row count of class 1.
# random_state pins the draw so the balanced set, and everything trained on it,
# is reproducible; 101 matches the seed used for the train/dev/test split.
# NOTE(review): resampling happens before the split, so copies of the same row
# can end up in both the training and the held-out sets — confirm this is intended.
df_class0_oversampled = df_class0.sample(df_class1.shape[0], replace=True, random_state=101)
df_class2_oversampled = df_class2.sample(df_class1.shape[0], replace=True, random_state=101)

print(df_class0_oversampled.shape)
print(df_class2_oversampled.shape)

(12098, 2)
(12098, 2)


In [None]:
# Stack the class-1 rows on top of the two upsampled classes (row-wise, original
# index labels kept) and print the shape and per-class counts as a balance check.
df_oversampled = pd.concat([df_class1, df_class0_oversampled, df_class2_oversampled])
print(df_oversampled.shape)
print(df_oversampled['bias'].value_counts())

(36294, 2)
1    12098
0    12098
2    12098
Name: bias, dtype: int64


In [None]:
# From here on `df` names the balanced frame; this is the same object as
# df_oversampled, not a copy.
df = df_oversampled

In [None]:
# Re-derive the text and label Series from the current df.
X, y = df['title'], df['bias']

In [None]:
from sklearn.model_selection import train_test_split
# First cut: 80 % train, 20 % held out (t_X / t_y).
train_X, t_X, train_y, t_y = train_test_split(
    X, y,
    test_size=0.2,
    random_state=101,
)
# Second cut: halve the held-out part into test and dev (10 % of the data each).
test_X, dev_X, test_y, dev_y = train_test_split(
    t_X, t_y,
    test_size=0.5,
    random_state=101,
)

In [None]:
# Keep independent (deep) copies of every split, text and labels alike, under
# *_orig names; .copy() is deep by default.
train_X_orig, dev_X_orig, test_X_orig = (texts.copy() for texts in (train_X, dev_X, test_X))
train_y_orig, dev_y_orig, test_y_orig = (labels.copy() for labels in (train_y, dev_y, test_y))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF (unigrams + bigrams, at most 8000 features) on the training text
# only, then apply the fitted vocabulary to dev and test. lowercase=False
# leaves casing alone; the titles were already lower-cased in preprocessing.
# The *_orig text copies are the inputs so that re-running this cell does not
# feed the already-vectorised arrays back into the vectorizer.
# .toarray() converts each sparse result into a dense array.
vectorizer = TfidfVectorizer(max_features = 8000, lowercase=False, ngram_range=(1,2))
train_X = vectorizer.fit_transform(train_X_orig).toarray()
dev_X = vectorizer.transform(dev_X_orig).toarray()
test_X = vectorizer.transform(test_X_orig).toarray()

In [None]:
import joblib
# Serialise the fitted vectorizer to tfidf_for_bias.pkl in the current working
# directory.
joblib.dump(vectorizer, 'tfidf_for_bias.pkl')

['tfidf_for_bias.pkl']