# Sentiment140 and Twitter Data Cleaning/Preprocessing

In [58]:
import pandas as pd
import numpy as np
import time

import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on; per the log output these
# calls only report "already up-to-date" when the data is present.
nltk.download('punkt')      # tokenizer models (used by word_tokenize)
nltk.download('stopwords')  # stopword lists (used via nltk.corpus.stopwords)
nltk.download('wordnet')    # WordNet data (used by WordNetLemmatizer)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
# Colab-only step: mount Google Drive so the dataset path under /content/drive
# used by the read_csv call below is readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Sentiment140

In [60]:
# Load the raw Sentiment140 CSV from the mounted Drive. `names` supplies the
# placeholder column labels "0".."5"; the cleaning pipeline renames them later.
sentiment140 = pd.read_csv(
    '/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/sentiment140.csv',
    encoding="ISO-8859-1",
    names=["0", "1", "2", "3", "4", "5"],
)

In [61]:
# Preview the raw frame; columns still carry the placeholder names "0".."5".
sentiment140.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Text Cleaning

In [63]:
def clean_times(df):
  """Split the raw `created_at` timestamp into weekday, month, day and time.

  Expects `created_at` strings laid out like 'Mon Apr 06 22:19:45 PDT 2009':
  weekday abbreviation, month abbreviation, two-digit day, HH:MM:SS, followed
  by anything else (timezone, year), which is discarded.

  The whole timestamp is parsed with one anchored regular expression through
  `Series.str.extract`. This avoids depending on a separately defined helper,
  on the `regex` default of `Series.str.replace` (which is literal matching in
  pandas >= 2.0), and on fixed character offsets.

  Args:
    df: DataFrame with a string `created_at` column. It is modified in place.

  Returns:
    The same DataFrame, with new string columns `weekday`, `month` and `day`
    (zero-padded, e.g. '06') appended, and `created_at` reduced to the
    'HH:MM:SS' time. Rows whose timestamp does not match the expected layout
    get NaN in all four columns.
  """
  days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
  # All twelve months, so data outside the Apr-Jun collection window also parses.
  months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
  days_regex = '|'.join(days)
  months_regex = '|'.join(months)

  timestamp_regex = (
      r'^\s*(?P<weekday>' + days_regex + r')'
      + r'\s+(?P<month>' + months_regex + r')'
      + r'\s+(?P<day>\d{2})'
      + r'\s+(?P<time>\d{2}:\d{2}:\d{2})'
  )

  # One row per input row, one column per named group; index-aligned with df.
  parts = df['created_at'].str.extract(timestamp_regex)

  # Assignment order keeps the output column order: ..., weekday, month, day.
  df['weekday'] = parts['weekday']
  df['month'] = parts['month']
  df['created_at'] = parts['time']
  df['day'] = parts['day']

  return df

In [64]:
def sentiment_cleaning_pipeline(df, step=20):
  """Down-sample the raw Sentiment140 frame, name its columns and split the timestamp.

  Args:
    df: Raw frame whose columns are the placeholder labels '0'..'5'.
    step: Keep every `step`-th row (rows 0, step, 2*step, ...). Defaults to
      20, the sampling rate previously hard-coded here.

  Returns:
    A new DataFrame with columns polarity, user_id, created_at, user_name and
    tweet, plus whatever columns `clean_times` derives from `created_at`.
    The original row index labels are kept.
  """
  column_names = {
      '0': 'polarity',
      '1': 'user_id',
      '2': 'created_at',
      '4': 'user_name',
      '5': 'tweet',
  }

  sampled = df.iloc[::step, :]
  # Column '3' is dropped; it is 'NO_QUERY' in every previewed row.
  renamed = sampled.drop(columns=['3']).rename(columns=column_names)

  return clean_times(renamed)

In [65]:
# NOTE(review): this rebinds `sentiment140` to the cleaned frame, so the cell is
# not re-runnable without reloading the CSV first (the pipeline drops column
# '3', which no longer exists after the first run).
sentiment140 = sentiment_cleaning_pipeline(sentiment140)

In [66]:
# Inspect the down-sampled frame with the renamed and derived time columns.
sentiment140

Unnamed: 0,polarity,user_id,created_at,user_name,tweet,weekday,month,day
0,0,1467810369,22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon,Apr,06
20,0,1467813985,22:20:37,quanvu,@alydesigns i was out most of the day so didn'...,Mon,Apr,06
40,0,1467818020,22:21:39,itsanimesh,really don't feel like getting up today... but...,Mon,Apr,06
60,0,1467822687,22:22:52,xVivaLaJuicyx,"@BatManYNG I miss my ps3, it's out of commissi...",Mon,Apr,06
80,0,1467834053,22:25:52,thelazyboy,sleep soon... i just hate saying bye and see y...,Mon,Apr,06
...,...,...,...,...,...,...,...,...
1599900,4,2193574897,08:38:37,johnwelshphd,@cris1015 My goal on stocks like $MTXX is to h...,Tue,Jun,16
1599920,4,2193575839,08:38:42,bendotorg,Largest/fastest hotspot in U.S.A. is located i...,Tue,Jun,16
1599940,4,2193576797,08:38:47,jaycemiguel,"@MsKCJones Yeah it's very irritating, right? B...",Tue,Jun,16
1599960,4,2193577726,08:38:52,FrayBaby,@pokapolas love the donut and the toadstool.,Tue,Jun,16


## Preprocessing

In [67]:
# Module-level resources read by clean_tweet.
english_stopwords = set(stopwords.words("english"))  # a set, for fast membership tests
lemma = WordNetLemmatizer()

In [68]:
def clean_tweet(tweet):
  """Normalize a raw tweet into a space-joined string of lemmatized tokens.

  Steps, in order: strip URLs and @handles, replace every non-letter with a
  space, lowercase, tokenize, drop English stopwords, lemmatize each token as
  a verb, and drop tokens shorter than three characters.

  Relies on the module-level `english_stopwords` set and `lemma` lemmatizer.

  Args:
    tweet: The raw tweet text. Missing values (None/NaN) are treated as an
      empty tweet; any other non-string value is converted with `str()`.

  Returns:
    The cleaned tokens joined by single spaces ('' if nothing survives).
  """
  # Coerce before any regex work: re.sub raises TypeError on non-string input.
  if not isinstance(tweet, str):
    tweet = '' if pd.isna(tweet) else str(tweet)

  # Raw strings keep `\S` from being read as an (invalid) string escape.
  text = re.sub(r'http\S+', '', tweet)  # remove URLs
  text = re.sub(r'@\S+', '', text)  # remove @handles
  text = re.sub(r'[^a-zA-Z]', ' ', text)  # replace non-letters (digits included) with spaces
  text = text.lower()

  tokens = word_tokenize(text)
  tokens = [token for token in tokens if token not in english_stopwords]
  tokens = [lemma.lemmatize(word=token, pos='v') for token in tokens]
  tokens = [token for token in tokens if len(token) > 2]  # drop tokens shorter than 3 chars

  return ' '.join(tokens)

In [69]:
# Tweets before text cleaning, for comparison with the preview after clean_tweet is applied.
sentiment140.head()

Unnamed: 0,polarity,user_id,created_at,user_name,tweet,weekday,month,day
0,0,1467810369,22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon,Apr,6
20,0,1467813985,22:20:37,quanvu,@alydesigns i was out most of the day so didn'...,Mon,Apr,6
40,0,1467818020,22:21:39,itsanimesh,really don't feel like getting up today... but...,Mon,Apr,6
60,0,1467822687,22:22:52,xVivaLaJuicyx,"@BatManYNG I miss my ps3, it's out of commissi...",Mon,Apr,6
80,0,1467834053,22:25:52,thelazyboy,sleep soon... i just hate saying bye and see y...,Mon,Apr,6


In [70]:
# Overwrites the `tweet` column with its cleaned form; the raw text is not kept.
sentiment140['tweet'] = sentiment140['tweet'].apply(clean_tweet)

In [71]:
# Tweets after text cleaning.
sentiment140.head()

Unnamed: 0,polarity,user_id,created_at,user_name,tweet,weekday,month,day
0,0,1467810369,22:19:45,_TheSpecialOne_,switchfoot awww bummer shoulda get david carr ...,Mon,Apr,6
20,0,1467813985,22:20:37,quanvu,alydesigns day get much,Mon,Apr,6
40,0,1467818020,22:21:39,itsanimesh,really feel like get today get study tomorrows...,Mon,Apr,6
60,0,1467822687,22:22:52,xVivaLaJuicyx,batmanyng miss commission wutcha play cop bloo...,Mon,Apr,6
80,0,1467834053,22:25:52,thelazyboy,sleep soon hate say bye see tomorrow night,Mon,Apr,6


## Twitter Tweets