## Imports / settings

In [17]:
# General imports
import string

# Analysis imports
import pandas as pd
import numpy as np

# NLP imports
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas display settings: show up to 100 rows and up to 90 characters per cell
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_colwidth", 90)

# Downloads (for NLP)
import nltk
nltk.download('wordnet')
nltk.download('tagsets')
nltk.download('averaged_perceptron_tagger')
# The stop-word list read by stopwords.words('english') during pre-processing
# is a separate corpus; without it that call raises LookupError on a fresh machine.
nltk.download('stopwords');

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Functions

These are helper functions that assist in the manipulation of tweet strings for pre-processing purposes.

In [18]:
def strip_rt_user(text):
    """
    Lower-case a tweet, first cutting off everything up to and including the
    first colon when the tweet starts with "RT".

    When an "RT" tweet contains no colon, ``str.find`` returns -1, the slice
    starts at 0 and the whole text is kept.
    """
    is_retweet = text[0:2] == "RT"
    body = text[text.find(":") + 1:] if is_retweet else text
    return body.lower()

def get_rt_user(text):
    """
    Return the lower-cased handle of the retweeted user, or "" if there is none.

    A retweet header is recognised as text that starts with "RT" and has an
    "@" somewhere before its first ":" (e.g. "RT @SomeUser: ..."). The handle
    is everything between that "@" and the colon.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The handle without the "@", lower-cased; "" when the text is not a
        retweet or the header is malformed.
    """
    if text[0:2] != "RT":
        return ""

    colon = text.find(":")
    if colon == -1:
        # No header terminator. Slicing with -1 here would silently return
        # the tweet body minus its last character as the "user".
        return ""

    header = text[:colon]
    at = header.find("@")
    if at == -1:
        # "RT" prefix and a colon but no handle (e.g. "RTX launch: ...").
        return ""

    return header[at + 1:].lower()

def addHashTags(text):
    """Wrap ``text`` in a leading and a trailing '#' character."""
    marker = "#"
    return "".join([marker, text, marker])

# Map a Treebank-style POS tag onto the matching WordNet POS constant
def get_wordnet_pos(treebank_tag):
    '''
    Convert a Treebank-style POS tag (as produced by nltk's pos_tag) into a
    WordNet POS constant.

    Only the first letter of the tag is inspected: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag, including an empty one, falls back
    to noun.
    '''
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

def remove_characters(text, char_to_remove):
    """
    Return ``text`` with every character that is contained in
    ``char_to_remove`` left out; the order of the remaining characters is kept.
    """
    kept = []
    for ch in text:
        if ch in char_to_remove:
            continue
        kept.append(ch)
    return ''.join(kept)

def remove_punctuation(text):
    """
    Return ``text`` with every character in ``string.punctuation`` removed.

    ``string.punctuation`` covers ASCII punctuation only, so non-ASCII marks
    (e.g. curly quotes) are left in place. Note that '#' and '@' are part of
    that set and are removed too.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without ASCII punctuation characters.
    """
    # A deletion table lets str.translate strip the characters in one pass
    # instead of testing every character against the punctuation string.
    return text.translate(str.maketrans('', '', string.punctuation))

def tag_and_lemmatize(text):
    """
    POS-tag a sequence of tokens and return the list of their lemmas.

    ``text`` is handed to ``pos_tag`` as-is (the pre-processing step passes
    the ``text_tokenized`` lists). Each resulting tag is converted with
    ``get_wordnet_pos`` so the WordNet lemmatizer receives a part of speech
    for every token.
    """
    tagged_tokens = pos_tag(text)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(treebank_tag)))
    return lemmas

def dummy_fun(doc):
    """
    Return ``doc`` unchanged (identity function).

    NOTE(review): not called anywhere in the visible part of the notebook;
    presumably meant as a no-op tokenizer/preprocessor for text that is
    already tokenized -- confirm against its caller.
    """
    return doc

# Run the complete pre-processing pipeline on a df
def preprocessing(df):
    """
    Call the three pre-processing stages, in order, on the same DataFrame.

    Nothing is returned; each stage is handed ``df`` itself and its return
    value is ignored.
    """
    stages = (
        preprocessing_01_model_specific,
        preprocessing_02_general,
        preprocessing_03_tag_and_lemmatize,
    )
    for stage in stages:
        stage(df)
    
    
def preprocessing_01_model_specific(df):
    """
    Split the retweet header off the ``text`` column.

    Adds an ``RT_user`` column holding the retweeted handle wrapped in '#'
    characters (or "" when ``get_rt_user`` finds none), then replaces ``text``
    with the output of ``strip_rt_user``. Both columns are assigned on ``df``
    itself.
    """
    # Extract the handle first: it has to be read before the header is stripped.
    rt_handles = df['text'].apply(get_rt_user)
    df['RT_user'] = rt_handles.apply(
        lambda handle: "" if handle == "" else addHashTags(handle)
    )

    # Now drop the header from the tweet text itself.
    df['text'] = df['text'].apply(strip_rt_user)
    
def preprocessing_02_general(df):
    """
    Clean the tweet text of ``df`` in place and add the derived columns.

    Expects the columns ``text``, ``class`` and ``RT_user`` (the latter is
    created by ``preprocessing_01_model_specific``). Steps, in order:

    1. lower-case the text and drop tokens that start with "http";
    2. remove English stop words (plus "amp");
    3. remove ASCII punctuation, then purely numeric tokens;
    4. append the ``RT_user`` marker and split the text into the new
       ``text_tokenized`` column;
    5. drop the ``id``, ``author_id`` and ``created_at`` columns, drop rows
       whose text ended up empty, and renumber the index;
    6. add ``class_label``, an integer code per distinct ``class`` value
       (codes follow the sorted order of the class names).

    Every step modifies ``df`` itself, because callers invoke this function
    without using a return value. The same frame is also returned for
    convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet frame to clean; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Lower case the text tweets
    df['text'] = df['text'].str.lower()

    # Strip out the meaningless links
    df['text'] = df['text'].apply(
        lambda x: " ".join(n for n in x.split() if not n.startswith("http"))
    )

    # Strip any excess white space
    df['text'] = df['text'].str.strip()

    # Take out stop words
    sw = set(stopwords.words('english'))
    sw.update(['amp'])
    df['text'] = df['text'].apply(lambda x: " ".join(n for n in x.split() if n not in sw))

    # Remove punctuation
    df['text'] = df['text'].apply(remove_punctuation)

    # Make sure we don't have any random numbers
    df['text'] = df['text'].apply(lambda x: " ".join(n for n in x.split() if not n.isnumeric()))

    # Put together the RT user and the tweet text
    df['text'] = df['text'] + " " + df['RT_user']

    # Make a new column, tokenize the words
    df['text_tokenized'] = df['text'].str.split()

    # The structural changes below use inplace=True on purpose: rebinding the
    # local name (df = df.drop(...)) would leave the caller's frame untouched.
    # errors='ignore' keeps the function usable on a frame that has already
    # lost these columns.
    df.drop(columns=['id', 'author_id', 'created_at'], errors='ignore', inplace=True)

    # Rows whose text is empty after cleaning carry no signal: mark and drop them.
    df['text'] = df['text'].apply(lambda x: np.nan if len(x.strip()) == 0 else x)
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Integer-encode the class names. factorize(sort=True) numbers the sorted
    # unique values 0..k-1 and needs nothing beyond pandas.
    df['class_label'] = pd.factorize(df['class'], sort=True)[0]

    return df
    
def preprocessing_03_tag_and_lemmatize(df):
    """
    Replace every token list in the ``text_tokenized`` column with the result
    of ``tag_and_lemmatize`` for that list. The column is assigned on ``df``
    itself; nothing is returned.
    """
    lemmatized = df['text_tokenized'].apply(tag_and_lemmatize)
    df['text_tokenized'] = lemmatized

## Load tweet data

Load the tweet data from file. Model 1 reads from `tweet_list2.csv`, which is a scaled-down version of the full tweet data set.

In [19]:
# Read the tweet export and convert every column to strings in one step
tweet_list_file = 'data/tweet_list2.csv'
df = pd.read_csv(tweet_list_file).astype(str)

# NOTE(review): the str cast turns missing values into the literal string
# 'nan', so later null checks on this frame cannot detect them.

# Preview the first rows
df.head()

Unnamed: 0,user_name,class,id,text,author_id,created_at
0,TeamPelosi,Politics - Liberal,1.62e+18,"On this day 83 years ago, Democrats Delivered the first Social Security checks ever! ...",2461810448.0,2023-01-31 21:00:26+00:00
1,TeamPelosi,Politics - Liberal,1.62e+18,We must keep our children safe from gun violence. Safe storage of guns saves lives and...,2461810448.0,2023-01-30 18:45:49+00:00
2,TeamPelosi,Politics - Liberal,1.62e+18,Democrats believe that health care is a human right and #DemocratsDelivered help for ...,2461810448.0,2023-01-28 21:20:12+00:00
3,TeamPelosi,Politics - Liberal,1.62e+18,Congratulations @PADems for your hard-won victories electing Pennsylvania Democrats wh...,2461810448.0,2023-01-28 04:00:31+00:00
4,TeamPelosi,Politics - Liberal,1.62e+18,My heart goes out to Tyre Nichols mother and their entire family. Tyre should be alive...,2461810448.0,2023-01-28 02:15:32+00:00


## Data cleaning

**Check for nulls**

In [20]:
# Column dtypes and non-null counts. Every column was cast to str when the
# file was loaded, so all entries count as non-null here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101074 entries, 0 to 101073
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   user_name   101074 non-null  object
 1   class       101074 non-null  object
 2   id          101074 non-null  object
 3   text        101074 non-null  object
 4   author_id   101074 non-null  object
 5   created_at  101074 non-null  object
dtypes: object(6)
memory usage: 4.6+ MB


Notes:
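- No null values are reported, which is expected because I downloaded this data myself. Note, however, that every column was cast to `str` when the data was loaded, so a missing value would now be the string `'nan'` and still count as non-null; this check alone cannot prove that nothing is missing.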

**Check for duplicates**

In [21]:
# Number of rows that are exact copies of an earlier row (all columns equal)
df.duplicated().sum()

804

Notes:
- There are some duplicate rows. As noted in the data collection notebook, tweets from some accounts were probably downloaded more than once while running the download function.

**Drop duplicates**

In [22]:
# Keep the first occurrence of each duplicated row (the index keeps its gaps)
df = df.drop_duplicates()
# Re-count to confirm that no duplicates remain
df.duplicated().sum()

0

Notes:
- Duplicates have been deleted.

## Data review

Check class balance at the tweet level

In [23]:
# Number of tweets per class, largest first
df['class'].value_counts()

Business and finance       21614
Science / Technology       15548
Politics - Conservative    15500
TV / movies                12007
Politics - Liberal         12001
Sports                     12000
Music                      11600
Name: class, dtype: int64

Notes: 
- It's imbalanced but I'm going to leave it and see if we can still make predictions from the data we have

## Pre-processing 

**Warning** This code performs all pre-processing, including lemmatization of the tweet text.  As such, it takes a few minutes to run.  

In [None]:
# Make a copy of the df, leave the original untouched
df_pp = df.copy()
# preprocessing() is handed df_pp itself and returns nothing, so no result is assigned
preprocessing(df_pp)
# Separate working copy for modelling; df_pp keeps the original class names,
# which the class-merging cell below uses to build its masks
df_model = df_pp.copy()

Make sure there are no nulls after pre-processing

In [None]:
# Count missing values per column of the pre-processed frame
df_model.isna().sum()

First, let's try to predict the primary interest of the user between our main classifications:
- Politics
- Sports and entertainment
- Business and finance
- Science / Technology

In [None]:
# Merged label -> the original class names it absorbs. The masks are built
# from df_pp, whose class column still holds the original names.
merged_classes = {
    'Politics': ['Politics - Conservative', 'Politics - Liberal'],
    'Sports / Entertainment': ['Music', 'TV / movies', 'Sports'],
    'Business': ['Business and finance'],
}
for merged_label, original_labels in merged_classes.items():
    df_model.loc[df_pp['class'].isin(original_labels), 'class'] = merged_label

# Leave out any rows whose original class was 'Travel'
df_model = df_model.loc[df_pp['class'] != 'Travel']

df_model['class'].value_counts()

Aggregate all tokenized words by class

In [11]:
# Concatenate every tweet's token list into one list per class.
# A single flattening pass per group avoids the repeated list + list copies
# that 'sum' performs on a column of Python lists.
df_model = (
    df_model
    .groupby('class')['text_tokenized']
    .agg(lambda token_lists: [token for tokens in token_lists for token in tokens])
    .reset_index()
)
df_model

NameError: name 'df_model' is not defined

In [None]:
# Persist the aggregated frame; with the default settings the index is
# written as the first (unnamed) CSV column
df_model.to_csv("saved_model.csv")

In [12]:
# set style for SNS to get higher resolution pics
# (figure.dpi is used for on-screen rendering, savefig.dpi for saved files)
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
# Apply matplotlib's 'ggplot' style sheet on top of the seaborn settings
plt.style.use('ggplot')

## EDA

### Business

In [13]:
# One row per token for the Business class
df_subset = df_model[(df_model['class']=='Business')]
df_subset = df_subset.text_tokenized.explode()

# The 12 most frequent tokens
graphinfo = df_subset.value_counts().head(12)

fig, ax = plt.subplots(figsize=(4.5,3))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.barplot(x=graphinfo.index, y=graphinfo.values, ax=ax, color='blue')
plt.xticks(rotation=90)
ax.set_ylabel("count", fontsize=10)
ax.set_xlabel('words', fontsize=10)
ax.tick_params(labelsize=10)
ax.set_title('Words in Business Tweets', fontsize=12);

NameError: name 'df_model' is not defined

### Sports / Entertainment

In [14]:
# One row per token for the Sports / Entertainment class
df_subset = df_model[(df_model['class']=='Sports / Entertainment')]
df_subset = df_subset.text_tokenized.explode()

# The 12 most frequent tokens
graphinfo = df_subset.value_counts().head(12)

fig, ax = plt.subplots(figsize=(4.5,3))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.barplot(x=graphinfo.index, y=graphinfo.values, ax=ax, color='blue')
plt.xticks(rotation=90)
ax.set_ylabel("count", fontsize=10)
ax.set_xlabel('words', fontsize=10)
ax.tick_params(labelsize=10)
ax.set_title('Words in Sports / Entertainment Tweets', fontsize=12);


NameError: name 'df_model' is not defined

### Politics

In [15]:
# One row per token for the Politics class
df_subset = df_model[(df_model['class']=='Politics')]
df_subset = df_subset.text_tokenized.explode()

# The 12 most frequent tokens
graphinfo = df_subset.value_counts().head(12)

fig, ax = plt.subplots(figsize=(4.5,3))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.barplot(x=graphinfo.index, y=graphinfo.values, ax=ax, color='blue')
plt.xticks(rotation=90)
ax.set_ylabel("count", fontsize=10)
ax.set_xlabel('words', fontsize=10)
ax.tick_params(labelsize=10)
ax.set_title('Words in Politics Tweets', fontsize=12);

NameError: name 'df_model' is not defined

### Science / Technology

In [16]:
# One row per token for the Science / Technology class
df_subset = df_model[(df_model['class']=='Science / Technology')]
df_subset = df_subset.text_tokenized.explode()

# The 12 most frequent tokens
graphinfo = df_subset.value_counts().head(12)

fig, ax = plt.subplots(figsize=(4.5,3))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.barplot(x=graphinfo.index, y=graphinfo.values, ax=ax, color='blue')
plt.xticks(rotation=90)
ax.set_ylabel("count", fontsize=10)
ax.set_xlabel('words', fontsize=10)
ax.tick_params(labelsize=10)
ax.set_title('Words in Science / Technology', fontsize=12);


NameError: name 'df_model' is not defined