<a href="https://colab.research.google.com/github/mchanwa/COS424/blob/main/COS424_HW1Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing Code for Twitter Dataset

In [None]:
# Mounts Google Drive at /content/drive so the dataset CSVs read further down
# (under /content/drive/MyDrive/...) are reachable. Only works inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install emoji

import pandas as pd
import numpy as np
import regex as re
import string
import nltk
import emoji

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

# Downloads the NLTK data used by word_tokenize, pos_tag, the stopword list and
# the WordNet lemmatizer. Recent NLTK releases look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older pickled resources (and
# some releases need 'omw-1.4' alongside 'wordnet'), so both generations are
# requested; nltk.download() skips anything that is already up to date.
for nltk_resource in ('stopwords',
                      'punkt', 'punkt_tab',
                      'wordnet', 'omw-1.4',
                      'averaged_perceptron_tagger',
                      'averaged_perceptron_tagger_eng'):
    nltk.download(nltk_resource)

# English stopwords, except that 'all' is kept as a regular token.
# NOTE(review): presumably because 'all' is informative for this dataset
# (e.g. "all lives matter") -- confirm.
stop_words = set(stopwords.words('english')) - {'all'}

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |██▌                             | 10kB 6.1MB/s eta 0:00:01[K     |█████                           | 20kB 9.7MB/s eta 0:00:01[K     |███████▌                        | 30kB 12.7MB/s eta 0:00:01[K     |██████████                      | 40kB 11.1MB/s eta 0:00:01[K     |████████████▌                   | 51kB 8.4MB/s eta 0:00:01[K     |███████████████                 | 61kB 6.7MB/s eta 0:00:01[K     |█████████████████▌              | 71kB 7.5MB/s eta 0:00:01[K     |████████████████████            | 81kB 7.9MB/s eta 0:00:01[K     |██████████████████████▌         | 92kB 7.3MB/s eta 0:00:01[K     |█████████████████████████       | 102kB 7.6MB/s eta 0:00:01[K     |███████████████████████████▌    | 112kB 7.6MB/s eta 0:00:01[K     |██████████████████████████████  | 122kB 7.6MB/s eta 0:00:

In [None]:
# Looks up the WordNet part of speech of a word so it can be lemmatized
# This function is based on code from:
#   https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching `word`.

    The word is tagged on its own with nltk.pos_tag and only the first letter
    of the resulting tag is used: J -> adjective, N -> noun, V -> verb,
    R -> adverb. Any other initial falls back to noun.
    """
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    _, pos_tag_str = nltk.pos_tag([word])[0]
    initial = pos_tag_str[0].upper()
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    return wordnet.NOUN

# Preprocesses the tweets text
# This function is based on code from:
#   https://www.pluralsight.com/guides/building-a-twitter-sentiment-analysis-in-python
def preprocess_text(tweet):
    """Normalize one tweet into a space-separated string of lemmas.

    Steps, in order: emojis -> their text names, retweet marker removed,
    lowercased, URLs and @mentions blanked out, punctuation stripped,
    stopwords dropped, remaining tokens lemmatized.

    Args:
        tweet: The raw tweet text (str).

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    # Changes emojis to words (space-delimited so they tokenize separately)
    tweet = emoji.demojize(tweet, delimiters=(' ', ' '))
    # Removes the standalone 'RT' retweet marker. The word boundaries matter:
    # without them "START now" would lose its "RT " and become "STAnow".
    # This must run before lowercasing because the marker is matched in caps.
    tweet = re.sub(r'\bRT\b\s+', '', tweet)
    # Removes capitalization
    tweet = tweet.lower()
    # Removes urls & user mentions from tweet ('http\S+' already covers https)
    tweet = re.sub(r"http\S+|www\S+|@\w+", ' ', tweet)
    # Removes punctuation (\p{P} needs the `regex` module, imported as `re`)
    tweet = re.sub(r'\p{P}+', '', tweet)
    # Removes stopwords
    tokens = [w for w in word_tokenize(tweet) if w not in stop_words]
    # Performs lemmatization on tokens, using each token's own POS tag
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens]
    return " ".join(lemma_words)

# Preprocesses the text of the Tweets in the df and returns the df
# By default, this removes the Tweets with the "neither" label
def preprocess_df(df, remove_neither=True):
    """Clean the "text" column of `df` with preprocess_text.

    The "text" column of `df` is overwritten in place (the caller's frame is
    modified), matching the previous behaviour of this function.

    Args:
        df: DataFrame with a "text" column and, when `remove_neither` is
            true, a "BLM" label column.
        remove_neither: If true, rows whose "BLM" value is "neither" are
            left out of the returned frame.

    Returns:
        `df` itself when `remove_neither` is false; otherwise a filtered
        frame that keeps the original index labels of the surviving rows.
    """
    text_col = "text"
    # Element-wise map instead of looping over range(len(df)): the old loop
    # used positions 0..n-1 as index *labels*, which raised KeyError (or
    # appended new rows) on any frame without a default RangeIndex.
    # str() mirrors the old per-row conversion of non-string values.
    df[text_col] = df[text_col].map(lambda raw: preprocess_text(str(raw)))
    if remove_neither:
        return df[df['BLM'] != "neither"]
    return df

In [None]:
# Loads the training CSV, blanks out missing values and cleans the tweet text
path = "/content/drive/MyDrive/Colab Notebooks/train.csv"  # Path to train dataset
# NaN cells become empty strings so that every tweet is a plain string
train_df = preprocess_df(pd.read_csv(path).fillna(""))
train_df.head(5)


FileNotFoundError: ignored

In [None]:
# Uses a CountVectorizer to construct bag-of-words matrix
# TODO: document the max_features & ngram_range choices; none are passed here,
# so scikit-learn's defaults apply.
vectorizer = CountVectorizer()
# train_vocab is a 2d array of word counts learned from the training dataset
train_vocab = vectorizer.fit_transform(train_df['text']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and only fall back to the old name on releases that predate it.
train_feature_names = (vectorizer.get_feature_names_out()
                       if hasattr(vectorizer, "get_feature_names_out")
                       else vectorizer.get_feature_names())
# train_vocab_df is a dataframe where the element ij is the number of times word j occurred in Tweet i
train_vocab_df = pd.DataFrame(train_vocab, columns=train_feature_names)
# NOTE(review): train_labels keeps train_df's index while train_vocab_df gets a
# fresh 0..n-1 index -- align by position, not by label, when combining them.
train_labels = train_df['BLM']

In [None]:
# Peek at the first rows of the bag-of-words matrix (one column per vocabulary word)
train_vocab_df.head()

Unnamed: 0,02125,03,07,10,100,1000,10000,100000,10003,1000th,100letterstomaketheworldbetter,100plus,100reasons,100yr,101,1010,101015,1024,1029,1030am,1032,10456,1047fm,105,109milelong,10mostfascinatingpeople,10pm,10th,10x,10yr,11,110,1100,110000,1110c,113014,1130am,117,11915,11am,...,yourewelcome,yourlifematters,yourseves,youth,youthdevelopment,youthisthetruth,youthviolence,youtube,youve,yoyo,yr,ystrday,yu,yung,yup,yuvette,zachary,zacharyhammond,zacharyhammonds,zapiro,zaria,zemirbegic,zero,zi,zilphia,zimbabwe,zimmerman,zind,zinnbookfest,zion,zionist,zipper,ziptied,zombie,zone,zone17,zubat,zurbanotorres,zwarte,zwartepiet
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Loads the test CSV, blanks out missing values and cleans the tweet text
path = "/content/drive/MyDrive/Colab Notebooks/test.csv"  # Path to test dataset
# NaN cells become empty strings so that every tweet is a plain string
test_df = preprocess_df(pd.read_csv(path).fillna(""))

In [None]:
# Uses the vocab from the training dataset to vectorize the test dataset
# (transform only -- the vectorizer is not re-fitted on the test tweets)
test_vocab = vectorizer.transform(test_df['text']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and only fall back to the old name on releases that predate it.
test_feature_names = (vectorizer.get_feature_names_out()
                      if hasattr(vectorizer, "get_feature_names_out")
                      else vectorizer.get_feature_names())
# test_vocab_df is a dataframe where the element ij is the number of times word j
# occurred in Tweet i
test_vocab_df = pd.DataFrame(test_vocab, columns=test_feature_names)
test_labels = test_df['BLM']

In [None]:
# Reports how many training Tweets carry each BLM label
for blm_label in ("neither", "positive", "negative"):
    n_tweets = len(train_df[train_df['BLM'] == blm_label])
    print(f"Number of {blm_label} Tweets in training: {n_tweets}")

Number of neither Tweets in training: 0
Number of positive Tweets in training: 5528
Number of negative Tweets in training: 1219


In [None]:
# Reports how many test Tweets carry each BLM label
for blm_label in ("neither", "positive", "negative"):
    n_tweets = len(test_df[test_df['BLM'] == blm_label])
    print(f"Number of {blm_label} Tweets in test: {n_tweets}")

Number of neither Tweets in test: 0
Number of positive Tweets in test: 1383
Number of negative Tweets in test: 305
