# Sentiment140 and Twitter Data Cleaning/Preprocessing

In [1]:
import pandas as pd
import numpy as np
import time

import re

import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Sentiment 140

In [3]:
sentiment140 = pd.read_csv('/content/drive/MyDrive/makeathon_stuff/sentiment_full.csv', encoding="ISO-8859-1", names=["0", "1", "2", "3", "4", "5"])

In [4]:
sentiment140.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Text Cleaning

In [5]:
def extract_regex(regex,tweet):
  if re.search(regex,tweet) is not None:
    return re.search(regex,tweet).group() 
  else:
    return np.nan

In [6]:
def clean_times(df):
  days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
  months = ['Apr','May','Jun']
  days_regex = "(" + ")|(".join(days) + ")"
  months_regex = "(" + ")|(".join(months) + ")"

  df['weekday'] = df['created_at'].apply(lambda x: extract_regex(days_regex, x))
  df['month'] = df['created_at'].apply(lambda x: extract_regex(months_regex, x))

  df['created_at'] = df['created_at'].str.replace(days_regex, '')
  df['created_at'] = df['created_at'].str.replace(months_regex, '')
  df['created_at'] = df['created_at'].str.replace('PDT', '')
  df['created_at'] = df['created_at'].str.replace('2009', '')
  df['created_at'] = df['created_at'].str.replace('2009', '')

  df['day'] = df['created_at'].str[2:4]
  df['created_at'] = df['created_at'].str[4:]
  df['created_at'] = df['created_at'].str.strip()
  
  return df

In [7]:
def sentiment_cleaning_pipeline(df):
  df = df.iloc[::20, :]
  df = df.drop(columns = ['3'])
  df = df.rename(columns={'0': 'polarity', '1': 'user_id', '2': 'created_at', '4': 'user_name', '5': 'tweet'})  
  df = clean_times(df)

  return df

In [8]:
sentiment140 = sentiment_cleaning_pipeline(sentiment140)

## Preprocessing

In [9]:
english_stopwords = set(stopwords.words("english"))
lemma = WordNetLemmatizer()

In [10]:
def clean_tweet(tweet):
  tweet = re.sub('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', tweet) # remove url
  tweet = re.sub('@\S+', '', tweet) # remove handle
  tweet = re.sub('[^a-zA-Z]', ' ', tweet) # remove non alphanumeric characters
  tweet = str(tweet).lower() # make lowercase
  tweet = word_tokenize(tweet) # tokenize into a list
  tweet = [word for word in tweet if word not in english_stopwords] # remove stopwords
  tweet = [lemma.lemmatize(word=w, pos='v') for w in tweet] # lemmatize
  tweet = [word for word in tweet if len(word) > 2] # remove tokens smaller than 3
  tweet = ' '.join(tweet) # rejoin tweet
  return tweet

In [11]:
sentiment140.head()

Unnamed: 0,polarity,user_id,created_at,user_name,tweet,weekday,month,day
0,0,1467810369,22:19:45,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon,Apr,6
20,0,1467813985,22:20:37,quanvu,@alydesigns i was out most of the day so didn'...,Mon,Apr,6
40,0,1467818020,22:21:39,itsanimesh,really don't feel like getting up today... but...,Mon,Apr,6
60,0,1467822687,22:22:52,xVivaLaJuicyx,"@BatManYNG I miss my ps3, it's out of commissi...",Mon,Apr,6
80,0,1467834053,22:25:52,thelazyboy,sleep soon... i just hate saying bye and see y...,Mon,Apr,6


In [12]:
sentiment140 = sentiment140.dropna()

In [13]:
sentiment140['tweet'] = sentiment140['tweet'].apply(clean_tweet)

In [14]:
neg = sentiment140[sentiment140['polarity'] == 0]
pos = sentiment140[sentiment140['polarity'] == 4]

neg = neg.sample(n=10000)
pos = pos.sample(n=10000)
neg.reset_index(drop = True, inplace=True)
pos.reset_index(drop = True, inplace=True)
# sentiment_cleaned = neg.append(pos)
# sentiment_cleaned

In [15]:
neg['at_risk'] = 1
pos['at_risk'] = 0
neg.drop(columns='polarity', inplace = True)
pos.drop(columns='polarity', inplace = True)

In [16]:
neg.to_csv('/content/drive/MyDrive/makeathon_stuff/sentiment_negative.csv')
pos.to_csv('/content/drive/MyDrive/makeathon_stuff/sentiment_positive.csv')


## Twitter Tweets

In [17]:
twitter = pd.read_csv('/content/drive/MyDrive/makeathon_stuff/suicide_tweets_df.csv')

In [18]:
twitter.head()

Unnamed: 0,created_at,text,user_name,screen_name,location,friends_count,followers_count,is_quote_status,retweet_count,favorite_count,retweeted,favorited,verified,protected,lang
0,2021-03-15 01:22:21,O HARRY FANHOU????,Maria,shawuanlover,𝑺𝒂𝒏 𝑭𝒓𝒂𝒏𝒄𝒊𝒔𝒄𝒐,1359,1520,False,0,0,False,False,False,False,en
1,2021-03-15 01:22:21,YEAHHH HARRYY YOU BLOODY LEGEND YOUUU,EVE⛈HEARTBREAK WEATHER,NEWANGEL93,"San Francisco, CA",449,282,False,0,0,False,False,False,False,en
2,2021-03-15 01:22:21,RT @priceactionkim: 👉 The PATI Members only ne...,Kevin Hunt▫️Forex Trader,uk2asia,Asia/London UK,125,4335,False,1,0,False,False,False,False,en
3,2021-03-15 01:22:21,RT @warriors: 32 points &amp; 9 dimes on his 3...,KiDD | kaSOLID ✌,aldrinkidd_,"Angeles City, Central Luzon",769,658,False,556,0,False,False,False,False,en
4,2021-03-15 01:22:20,RT @RBReich: I don't know who needs to hear th...,jhin🇺🇸💙,jhin58794874,,625,174,False,2006,0,False,False,False,False,en


In [19]:
def find_weekday(row, day_not_weekday=False):
  regex_match = re.match('^([^\s]+)', row)
  date = regex_match.group()
  days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
  year, month, day = (int(x) for x in date.split('-'))

  if day_not_weekday:
    return day
  else:
    return days[datetime.date(year, month, day).weekday()]

In [20]:
def clean_twitter_times(df):
  days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

  df['weekday'] = df['created_at'].apply(find_weekday)
  df['month'] = 'Mar'
  df['day'] = df['created_at'].apply(find_weekday, day_not_weekday=True)

  df['created_at'] = df['created_at'].str[-8:]
  df['created_at'] = df['created_at'].str.strip()
  
  return df

In [21]:
def twitter_cleaning_pipeline(df):
  df = df.drop(columns = [col for col in df.columns if col not in ['created_at', 'text', 'screen_name']])
  df = df.rename(columns={'screen_name': 'user_name', 'text':'tweet'})  
  df = clean_twitter_times(df)

  return df

In [22]:
twitter = twitter_cleaning_pipeline(twitter)

In [23]:
twitter = twitter.dropna()

In [24]:
twitter['tweet'] = twitter['tweet'].apply(clean_tweet)

In [25]:
twitter

Unnamed: 0,created_at,tweet,user_name,weekday,month,day
0,01:22:21,harry fanhou,shawuanlover,Mon,Mar,15
1,01:22:21,yeahhh harryy bloody legend youuu,NEWANGEL93,Mon,Mar,15
2,01:22:21,pati members newsletter send inbox please chec...,uk2asia,Mon,Mar,15
3,01:22:21,point amp dim birthday dubnation,aldrinkidd_,Mon,Mar,15
4,01:22:20,know need hear bipartisanship pay bill put foo...,jhin58794874,Mon,Mar,15
...,...,...,...,...,...,...
10580,03:14:44,years ago psychiatrist tell time want stop pai...,trngpt21,Tue,Mar,9
10581,02:38:25,everyone want commit suicide,Sir_hungry,Tue,Mar,9
10582,00:37:33,bff commit suicide never saw come total shocker,lnunez_747,Tue,Mar,9
10583,23:03:56,well years ago entire world want force mentall...,marioelf1986,Mon,Mar,8


In [26]:
twitter['at_risk'] = 1

In [27]:
twitter.to_csv('/content/drive/MyDrive/makeathon_stuff/cleaned_twitter.csv')