# Cleaning Tweets - NLP

### Importing Libraries

In [443]:
import numpy as np
import random
import _pickle as pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
import nltk
import string
import re
from gensim.models import Word2Vec
from sklearn.model_selection import GridSearchCV

### Importing DataFrames

In [429]:
# Loading in the first DF
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load .pkl files that come from a trusted source.
# The paths are relative, so they resolve against the kernel's working directory.
with open("city1_df.pkl",'rb') as fp:
    city_1 = pickle.load(fp)

# Loading in the second DF
with open("city2_df.pkl",'rb') as fp:
    city_2 = pickle.load(fp)

### Checking the DataFrames

In [430]:
#city_1

In [431]:
#city_2

## Cleaning and NLP

### Function for balancing the class values

In [432]:
def _drop_random_rows(df, n_drop, rng):
    """Returns `df` without `n_drop` randomly chosen rows, keeping row order."""
    # Sample row *positions* rather than index labels: dropping by label
    # removes every row that shares the label, so a DataFrame with repeated
    # index values would lose more than `n_drop` rows.
    dropped = set(rng.sample(range(len(df)), n_drop))
    return df.iloc[[pos for pos in range(len(df)) if pos not in dropped]]


def city_balance(city_1, city_2, seed=None):
    """
    Balances the number of tweets (rows) from each city.

    Rows are randomly dropped from whichever DataFrame is longer until both
    have the same length. No filtering by tweet length or by user is done
    here; apply such filters before calling this function if they are needed.

    Parameters
    ----------
    city_1, city_2 : pandas.DataFrame
        The two DataFrames to balance. Neither input is modified.
    seed : int, optional
        Seed for the row sampling. When None (the default) the module-level
        `random` generator is used, so `random.seed(...)` still controls the
        result.

    Returns
    -------
    (city_1, city_2) : tuple of pandas.DataFrame
        The two DataFrames with equal length. If the inputs were already the
        same length they are returned unchanged.
    """
    rng = random if seed is None else random.Random(seed)

    # Checking for class balance
    dif = abs(len(city_1) - len(city_2))

    # Randomly dropping rows from the DF with a greater number of rows
    if len(city_1) > len(city_2):
        city_1 = _drop_random_rows(city_1, dif, rng)
    elif len(city_2) > len(city_1):
        city_2 = _drop_random_rows(city_2, dif, rng)
    else:
        print("DFs are balanced")

    print(f"DF Lengths:\nCity 1 = {len(city_1)}\nCity 2 = {len(city_2)}")

    return city_1, city_2

In [433]:
# Seeding the RNG so the same rows are dropped on every Restart & Run All
random.seed(42)
city_1, city_2 = city_balance(city_1, city_2)

DF Lengths:
City 1 = 5004
City 2 = 5004


In [434]:
# Combining both dataframes
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# frame and, like append, keeps each city's original index labels.
main_df = pd.concat([city_1, city_2])
# Previewing only the first rows instead of rendering the whole frame
main_df[['tweet', 'City']].head(10)

Unnamed: 0,tweet,City
0,SCANDAL RANGERS TO THE RESCUE! I'm proud to be...,Seattle
1,Well-what is it?,Seattle
2,Enemy of the people!!!,Seattle
3,Top of the morning 😃 Mr President,Seattle
4,Is there any line Trump can cross that would m...,Seattle
5,So Trump and his supporters threatening to kil...,Seattle
6,America needs someone who is that tough!!! Res...,Seattle
7,When Trump and his allies finally hit rock bot...,Seattle
8,"seriously, i “thought” he was supposed to be t...",Seattle
9,Nope. Moot. pic.twitter.com/hwbHpQ6OQ0,Seattle


## Vectorizing
- TF-IDF Vectorizer

In [435]:
from nltk.stem.wordnet import WordNetLemmatizer

# Single shared lemmatizer instance; clean_words below calls
# lemmatizer.lemmatize on every token.
lemmatizer = WordNetLemmatizer()

In [456]:
# Stopwords list: NLTK's English stop words followed by each punctuation character
stop = [*stopwords.words('english'), *string.punctuation]

def clean_words(df_col, stop=stop):
    """
    Cleans tweets from dataframe column

    Each tweet is lowercased, stripped of URLs, split into alphabetic tokens
    (optionally with an apostrophe suffix, e.g. "don't"), filtered against
    the stop list and lemmatized with the module-level `lemmatizer`.

    Parameters
    ----------
    df_col : iterable of str
        The tweets to clean (e.g. a DataFrame column).
    stop : iterable of str
        Words to discard. Defaults to the module-level `stop` list.

    Returns
    -------
    list of str
        One string per input tweet, in the same order. Each kept lemma is
        followed by a single space, so non-empty results end with a space.
    """
    # Removing URLs (liberal URL pattern; the final character class lists the
    # punctuation, including typographic quotes, that may not end a URL)
    url_pattern = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')
    # Removing special characters: keep runs of letters, optionally followed
    # by an apostrophe and lowercase letters
    token_pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    # A set makes the per-token membership test constant time
    stop_words = set(stop)

    cleaned = []
    for tweet in df_col:
        # Lowercasing the words, then stripping URLs
        text = url_pattern.sub("", tweet.lower())

        kept = []
        for token in nltk.regexp_tokenize(text, token_pattern):
            # Filter on the raw token first: the default (noun) lemmatization
            # mangles some stop words (e.g. "was" -> "wa"), which would then
            # slip past a check made only on the lemma.
            if token in stop_words:
                continue
            lemma = lemmatizer.lemmatize(token)
            if lemma not in stop_words:
                kept.append(lemma)

        # Each lemma is followed by one space (trailing space included)
        cleaned.append("".join(lemma + " " for lemma in kept))

    return cleaned
    
# Cleaning every tweet of the combined DataFrame
data = clean_words(main_df["tweet"], stop=stop)

### Exporting the combined DataFrame and the cleaned tweets

In [457]:
# Saving the combined DataFrame
with open("main_df.pkl", "wb") as fp:
    pickle.dump(main_df, fp)
    
# Saving the list of cleaned tweet strings returned by clean_words
with open("clean_tweets.pkl", "wb") as fp:
    pickle.dump(data, fp)