<a href="https://colab.research.google.com/github/papaymaguire/ece219-project1/blob/main/project1/notebooks/question03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata

# Read the GitHub token and username through Colab's `userdata` API instead of
# hard-coding them in the notebook (token first, then username).
gh_pat, gh_username = (
    userdata.get(secret_name) for secret_name in ('gh_pat', 'gh_username')
)

In [2]:
# Start from a clean checkout: delete any previous clone, then clone the project
# repository using the username/token held in gh_username and gh_pat.
# NOTE(review): the token is interpolated into the clone URL, so git presumably
# records it in ece219-project1/.git/config — confirm that is acceptable here.
!rm -rf ece219-project1
!git clone https://{gh_username}:{gh_pat}@github.com/papaymaguire/ece219-project1.git

Cloning into 'ece219-project1'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (139/139), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 139 (delta 60), reused 84 (delta 30), pack-reused 0[K
Receiving objects: 100% (139/139), 19.87 MiB | 7.22 MiB/s, done.
Resolving deltas: 100% (60/60), done.


In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the same path is later handed to
# GoogleDriveDataIO as its first argument.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import sys

# Put the cloned repository at the front of the import path so that the
# `project1.*` imports in later cells resolve to it.
sys.path.insert(0, '/content/ece219-project1')

import pandas as pd
import numpy as np
import random

# Seed NumPy's global RNG and the stdlib RNG with the same value.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [5]:
from project1.utils.GoogleDriveDataIO import GoogleDriveDataIO

# Drive mount point (see drive.mount) and the project's data folder inside it.
DRIVE_MOUNT_POINT = "/content/drive"
DATA_FOLDER = "My Drive/EC ENGR 219/Project 1/Data"

drive_io = GoogleDriveDataIO(DRIVE_MOUNT_POINT, DATA_FOLDER)

In [6]:
# Load the stored "train" and "test" datasets (in that order) via the Drive helper.
train, test = (drive_io.load(split_name) for split_name in ("train", "test"))

In [7]:
from sklearn.pipeline import Pipeline
from project1.utils.TextPreprocessor import TextPreprocessor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
def build_vocab_pipe(type, min_df):
    """Build an unfitted pipeline that preprocesses text and counts terms.

    Args:
        type: Preprocessing mode forwarded to ``TextPreprocessor``; this
            notebook calls it with ``"lemm"`` and ``"stem"``.
        min_df: Forwarded unchanged to ``CountVectorizer(min_df=...)``.

    Returns:
        A ``Pipeline`` with two steps, ``"preprocess"`` followed by ``"count"``.
    """
    preprocess_step = ("preprocess", TextPreprocessor(type, n_jobs=2))
    count_step = ("count", CountVectorizer(stop_words="english", min_df=min_df))
    return Pipeline(steps=[preprocess_step, count_step])

In [8]:
# One vocabulary pipeline per (preprocessing mode, min_df) combination, built in
# the order lemm/3, lemm/5, stem/3, stem/5.
pipe_lemm_3, pipe_lemm_5, pipe_stem_3, pipe_stem_5 = (
    build_vocab_pipe(mode, min_df)
    for mode in ("lemm", "stem")
    for min_df in (3, 5)
)

In [9]:
# Fit every vocabulary pipeline on the training text only.
# Each fit runs that pipeline's own "preprocess" step before the "count" step,
# so the training text is preprocessed once per pipeline.
pipe_lemm_3.fit(train['full_text'])
pipe_lemm_5.fit(train['full_text'])
pipe_stem_3.fit(train['full_text'])
pipe_stem_5.fit(train['full_text'])

In [10]:
# Pull the learned vocabulary mapping out of each fitted pipeline's "count" step.
vocab_lemm_3, vocab_lemm_5, vocab_stem_3, vocab_stem_5 = (
    fitted_pipe['count'].vocabulary_
    for fitted_pipe in (pipe_lemm_3, pipe_lemm_5, pipe_stem_3, pipe_stem_5)
)

In [11]:
# Report the min_df=3 vocabulary sizes; label and count go on separate lines.
print("Size of vocabulary with lemmatization: ", len(vocab_lemm_3), sep="\n")
print("Size of vocabulary with stemming: ", len(vocab_stem_3), sep="\n")

Size of vocabulary with lemmatization: 
14097
Size of vocabulary with stemming: 
12179


In [12]:
def build_tfidf_pipe(vocab_model, preprocess_type="lemm", n_jobs=-1):
    """Build a preprocess -> count -> TF-IDF pipeline around a given vectorizer.

    Args:
        vocab_model: The vectorizer used as the ``"count"`` step. It is placed in
            the pipeline as-is (not copied), so fitting the returned pipeline
            fits this same object.
        preprocess_type: Preprocessing mode forwarded to ``TextPreprocessor``.
            Defaults to ``"lemm"``, the previously hard-coded value; pass the
            mode that ``vocab_model`` was built with (e.g. ``"stem"``) so the
            preprocessing and the vocabulary stay consistent.
        n_jobs: Forwarded to ``TextPreprocessor(n_jobs=...)``. Defaults to ``-1``,
            the previously hard-coded value.

    Returns:
        A ``Pipeline`` with steps ``"preprocess"``, ``"count"`` and ``"tfidf"``.
    """
    return Pipeline(steps=[
        ("preprocess", TextPreprocessor(preprocess_type, n_jobs=n_jobs)),
        ("count", vocab_model),
        ("tfidf", TfidfTransformer()),
    ])

In [13]:
# Wrap the lemmatization vectorizers (min_df=3 and min_df=5) in TF-IDF pipelines.
tfidf_pipe_3, tfidf_pipe_5 = (
    build_tfidf_pipe(vocab_pipe['count'])
    for vocab_pipe in (pipe_lemm_3, pipe_lemm_5)
)


In [14]:
# Fit both TF-IDF pipelines on the training text only.
# NOTE(review): each pipeline's "count" step is the same vectorizer object taken
# from pipe_lemm_3 / pipe_lemm_5, so this presumably refits it on the same data —
# confirm the repeated preprocessing cost is acceptable.
tfidf_pipe_3.fit(train['full_text'])
tfidf_pipe_5.fit(train['full_text'])

In [15]:
# Transform the training text with each fitted TF-IDF pipeline (min_df=3, then 5).
tfidf_matrix_3, tfidf_matrix_5 = (
    fitted_pipe.transform(train['full_text'])
    for fitted_pipe in (tfidf_pipe_3, tfidf_pipe_5)
)

In [16]:
# Compare the training TF-IDF matrix shapes for the two min_df settings.
print("Shape of TF-IDF matrix with min_df=3: ", tfidf_matrix_3.shape, sep="\n")
print("Shape of TF-IDF matrix with min_df=5: ", tfidf_matrix_5.shape, sep="\n")

Shape of TF-IDF matrix with min_df=3: 
(2780, 14097)
Shape of TF-IDF matrix with min_df=5: 
(2780, 9864)


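For the last bullet point we choose lemmatization with `min_df=3`; the cells below compute, report, and save the TF-IDF features of the train and test sets with that pipeline.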

In [17]:
# tfidf_pipe_3 (lemmatization, min_df=3) is the chosen pipeline. Its transform of
# the training text was already computed as tfidf_matrix_3, so reuse that result
# instead of preprocessing the whole training set again; only the test set
# needs a new pass through the fitted pipeline.
train_features = tfidf_matrix_3
test_features = tfidf_pipe_3.transform(test['full_text'])

In [18]:
# Report the final feature-matrix shapes; label and shape go on separate lines.
print("Shape of TF-IDF processed train set: ", train_features.shape, sep="\n")
print("Shape of TF-IDF processed test set: ", test_features.shape, sep="\n")

Shape of TF-IDF processed train set: 
(2780, 14097)
Shape of TF-IDF processed test set: 
(696, 14097)


In [19]:
# Persist the chosen TF-IDF features through the Drive helper.
# The second call is the cell's last expression, so its return value (the saved
# path shown in the output) is what the notebook displays.
drive_io.save("train_features", train_features)
drive_io.save("test_features", test_features)

'/content/drive/My Drive/EC ENGR 219/Project 1/Data/test_features'