# Sentiment140 and Twitter Data Cleaning/Preprocessing

In [172]:
import pandas as pd
import numpy as np
import time

import re

import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [141]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Sentiment 140

In [173]:
sentiment140 = pd.read_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/sentiment140.csv', encoding="ISO-8859-1", names=["0", "1", "2", "3", "4", "5"])

In [143]:
sentiment140.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Text Cleaning

In [144]:
def extract_regex(regex,tweet):
  if re.search(regex,tweet) is not None:
    return re.search(regex,tweet).group() 
  else:
    return np.nan

In [145]:
def clean_times(df):
  days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
  months = ['Apr','May','Jun']
  days_regex = "(" + ")|(".join(days) + ")"
  months_regex = "(" + ")|(".join(months) + ")"

  df['weekday'] = df['created_at'].apply(lambda x: extract_regex(days_regex, x))
  df['month'] = df['created_at'].apply(lambda x: extract_regex(months_regex, x))

  df['created_at'] = df['created_at'].str.replace(days_regex, '')
  df['created_at'] = df['created_at'].str.replace(months_regex, '')
  df['created_at'] = df['created_at'].str.replace('PDT', '')
  df['created_at'] = df['created_at'].str.replace('2009', '')
  df['created_at'] = df['created_at'].str.replace('2009', '')

  df['day'] = df['created_at'].str[2:4]
  df['created_at'] = df['created_at'].str[4:]
  df['created_at'] = df['created_at'].str.strip()
  
  return df

In [146]:
def sentiment_cleaning_pipeline(df):
  df = df.iloc[::20, :]
  df = df.drop(columns = ['3'])
  df = df.rename(columns={'0': 'polarity', '1': 'user_id', '2': 'created_at', '4': 'user_name', '5': 'tweet'})  
  df = clean_times(df)

  return df

In [147]:
sentiment140 = sentiment_cleaning_pipeline(sentiment140)

## Preprocessing

In [148]:
english_stopwords = set(stopwords.words("english"))
lemma = WordNetLemmatizer()

In [149]:
def clean_tweet(tweet):
  tweet = re.sub('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', tweet) # remove url
  tweet = re.sub('@\S+', '', tweet) # remove handle
  tweet = re.sub('[^a-zA-Z]', ' ', tweet) # remove non alphanumeric characters
  tweet = str(tweet).lower() # make lowercase
  tweet = word_tokenize(tweet) # tokenize into a list
  tweet = [word for word in tweet if word not in english_stopwords] # remove stopwords
  tweet = [lemma.lemmatize(word=w, pos='v') for w in tweet] # lemmatize
  tweet = [word for word in tweet if len(word) > 2] # remove tokens smaller than 3
  tweet = ' '.join(tweet) # rejoin tweet
  return tweet

In [150]:
sentiment140.head()

Unnamed: 0,polarity,user_id,created_at,user_name,tweet,weekday,month,day
0,0,1467810369,22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon,Apr,6
20,0,1467813985,22:20:37,quanvu,@alydesigns i was out most of the day so didn'...,Mon,Apr,6
40,0,1467818020,22:21:39,itsanimesh,really don't feel like getting up today... but...,Mon,Apr,6
60,0,1467822687,22:22:52,xVivaLaJuicyx,"@BatManYNG I miss my ps3, it's out of commissi...",Mon,Apr,6
80,0,1467834053,22:25:52,thelazyboy,sleep soon... i just hate saying bye and see y...,Mon,Apr,6


In [151]:
sentiment140 = sentiment140.dropna()

In [152]:
sentiment140['tweet'] = sentiment140['tweet'].apply(clean_tweet)

In [153]:
sentiment140 = sentiment140.sample(n=len(sentiment140)//2)

In [None]:
sentiment140['at_risk'] = 0

In [154]:
sentiment140.to_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/cleaned_s140.csv')

In [155]:
sentiment140.head()

Unnamed: 0,polarity,user_id,created_at,user_name,tweet,weekday,month,day
800000,4,1467822272,22:22:45,ersle,love guy best,Mon,Apr,6
800020,4,1467822994,22:22:56,sorano916,place peep contest thank vote anyways,Mon,Apr,6
800040,4,1467824005,22:23:14,subwaystory,dry sweet potato huh,Mon,Apr,6
800060,4,1467824828,22:23:25,suitelifeofkell,sorrry like different things kevin jonas girlf...,Mon,Apr,6
800080,4,1467825506,22:23:36,skinnylatte,depend version think one know like,Mon,Apr,6


## Twitter Tweets

In [156]:
twitter = pd.read_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/suicide_tweets_df.csv')

In [157]:
twitter.head()

Unnamed: 0,created_at,text,user_name,screen_name,location,friends_count,followers_count,is_quote_status,retweet_count,favorite_count,retweeted,favorited,verified,protected,lang
0,2021-03-15 01:22:21,O HARRY FANHOU????,Maria,shawuanlover,𝑺𝒂𝒏 𝑭𝒓𝒂𝒏𝒄𝒊𝒔𝒄𝒐,1359,1520,False,0,0,False,False,False,False,en
1,2021-03-15 01:22:21,YEAHHH HARRYY YOU BLOODY LEGEND YOUUU,EVE⛈HEARTBREAK WEATHER,NEWANGEL93,"San Francisco, CA",449,282,False,0,0,False,False,False,False,en
2,2021-03-15 01:22:21,RT @priceactionkim: 👉 The PATI Members only ne...,Kevin Hunt▫️Forex Trader,uk2asia,Asia/London UK,125,4335,False,1,0,False,False,False,False,en
3,2021-03-15 01:22:21,RT @warriors: 32 points &amp; 9 dimes on his 3...,KiDD | kaSOLID ✌,aldrinkidd_,"Angeles City, Central Luzon",769,658,False,556,0,False,False,False,False,en
4,2021-03-15 01:22:20,RT @RBReich: I don't know who needs to hear th...,jhin🇺🇸💙,jhin58794874,,625,174,False,2006,0,False,False,False,False,en


In [158]:
def find_weekday(row, day_not_weekday=False):
  regex_match = re.match('^([^\s]+)', row)
  date = regex_match.group()
  days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
  year, month, day = (int(x) for x in date.split('-'))

  if day_not_weekday:
    return day
  else:
    return days[datetime.date(year, month, day).weekday()]

In [159]:
def clean_twitter_times(df):
  days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

  df['weekday'] = df['created_at'].apply(find_weekday)
  df['month'] = 'Mar'
  df['day'] = df['created_at'].apply(find_weekday, day_not_weekday=True)

  df['created_at'] = df['created_at'].str[-8:]
  df['created_at'] = df['created_at'].str.strip()
  
  return df

In [160]:
def twitter_cleaning_pipeline(df):
  df = df.drop(columns = [col for col in df.columns if col not in ['created_at', 'text', 'screen_name']])
  df = df.rename(columns={'screen_name': 'user_name', 'text':'tweet'})  
  df = clean_twitter_times(df)

  return df

In [161]:
twitter = twitter_cleaning_pipeline(twitter)

In [162]:
twitter = twitter.dropna()

In [164]:
twitter['tweet'] = twitter['tweet'].apply(clean_tweet)

In [165]:
twitter

Unnamed: 0,created_at,tweet,user_name,weekday,month,day
0,01:22:21,harry fanhou,shawuanlover,Mon,Mar,15
1,01:22:21,yeahhh harryy bloody legend youuu,NEWANGEL93,Mon,Mar,15
2,01:22:21,pati members newsletter send inbox please chec...,uk2asia,Mon,Mar,15
3,01:22:21,point amp dim birthday dubnation,aldrinkidd_,Mon,Mar,15
4,01:22:20,know need hear bipartisanship pay bill put foo...,jhin58794874,Mon,Mar,15
...,...,...,...,...,...,...
9915,01:23:31,fuck alone fuck depress want anymore,superdupercud,Mon,Mar,8
9916,01:21:55,want get high play video game parent house wan...,dietcucumber,Mon,Mar,8
9917,00:22:05,tire feel alone time feel like drown let anyon...,taespooon,Mon,Mar,8
9918,00:16:04,tell friends dont want let friendship die also...,androidWG,Mon,Mar,8


In [174]:
twitter['at_risk'] = 1

In [171]:
twitter.to_csv('/content/drive/MyDrive/Gap Year/SureStart/Makeathon/data/cleaned data/cleaned_twitter.csv')