## Data Cleaning
After combining web-scraped depression related tweets, now we go ahead and clean the dataset in order to prepare for VADER-based sentiment analysis.

In [2]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('depressive_tweets_conbined.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,time,tweet,hashtags,cashtags
0,0,0,1.15135e+18,21:25:13,"Wow, my dad yday: “you don’t take those stupid...",[],[]
1,1,1,1.15135e+18,21:25:07,what part of this was really harmfult of a lot...,[],[]
2,2,2,1.15135e+18,21:25:06,one of the ways I got through my #depression i...,"['#depression', '#uncoveringthenewu', '#change...",[]
3,3,3,1.15135e+18,21:24:55,see i wanna do one of them but they all say th...,[],[]
4,4,4,1.15135e+18,21:24:51,IS IT clinical depression or is it the palpabl...,[],[]


In [5]:
import nltk
nltk.download(['punkt','stopwords'])
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\M&D\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\M&D\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249118 entries, 0 to 249117
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    249118 non-null  int64 
 1   Unnamed: 0.1  135497 non-null  object
 2   id            249108 non-null  object
 3   time          249108 non-null  object
 4   tweet         249095 non-null  object
 5   hashtags      249082 non-null  object
 6   cashtags      249082 non-null  object
dtypes: int64(1), object(6)
memory usage: 13.3+ MB


In [7]:
df_new = df[df['tweet'].notnull()]
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 249095 entries, 0 to 249117
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Unnamed: 0    249095 non-null  int64 
 1   Unnamed: 0.1  135474 non-null  object
 2   id            249095 non-null  object
 3   time          249095 non-null  object
 4   tweet         249095 non-null  object
 5   hashtags      249082 non-null  object
 6   cashtags      249082 non-null  object
dtypes: int64(1), object(6)
memory usage: 15.2+ MB


In [8]:
df_new['clean_tweet'] = df_new['tweet'].apply(lambda x: ' '.join([item for item in x.split() if item not in stopwords]))
df_new.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,time,tweet,hashtags,cashtags,clean_tweet
0,0,0,1.15135e+18,21:25:13,"Wow, my dad yday: “you don’t take those stupid...",[],[],"Wow, dad yday: “you don’t take stupid depressi..."
1,1,1,1.15135e+18,21:25:07,what part of this was really harmfult of a lot...,[],[],part really harmfult lot people went every gui...
2,2,2,1.15135e+18,21:25:06,one of the ways I got through my #depression i...,"['#depression', '#uncoveringthenewu', '#change...",[],one ways I got #depression learning dance rain...
3,3,3,1.15135e+18,21:24:55,see i wanna do one of them but they all say th...,[],[],"see wanna one say PTSD, depression, and/or anx..."
4,4,4,1.15135e+18,21:24:51,IS IT clinical depression or is it the palpabl...,[],[],IS IT clinical depression palpable hopelessnes...


In [9]:
df_new['vader_score'] = df_new['clean_tweet'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

In [10]:
df_new.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,time,tweet,hashtags,cashtags,clean_tweet,vader_score
0,0,0,1.15135e+18,21:25:13,"Wow, my dad yday: “you don’t take those stupid...",[],[],"Wow, dad yday: “you don’t take stupid depressi...",-0.4122
1,1,1,1.15135e+18,21:25:07,what part of this was really harmfult of a lot...,[],[],part really harmfult lot people went every gui...,-0.8074
2,2,2,1.15135e+18,21:25:06,one of the ways I got through my #depression i...,"['#depression', '#uncoveringthenewu', '#change...",[],one ways I got #depression learning dance rain...,0.3382
3,3,3,1.15135e+18,21:24:55,see i wanna do one of them but they all say th...,[],[],"see wanna one say PTSD, depression, and/or anx...",-0.4588
4,4,4,1.15135e+18,21:24:51,IS IT clinical depression or is it the palpabl...,[],[],IS IT clinical depression palpable hopelessnes...,-0.8316


In [12]:
positive_num = len(df_new[df_new['vader_score'] >=0.05])
#neutral_num = len(df_new[(df_new['vaderReviewScore'] >-0.05) & (df_new['vaderReviewScore']<0.05)])
negative_num = len(df_new[df_new['vader_score']<0.05])
positive_num, negative_num

(53996, 195099)

In [13]:
df_new['vader_sentiment_label']= df_new['vader_score'].map(lambda x:int(1) if x>=0.05 else int(0))
df_new.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,time,tweet,hashtags,cashtags,clean_tweet,vader_score,vader_sentiment_label
0,0,0,1.15135e+18,21:25:13,"Wow, my dad yday: “you don’t take those stupid...",[],[],"Wow, dad yday: “you don’t take stupid depressi...",-0.4122,0
1,1,1,1.15135e+18,21:25:07,what part of this was really harmfult of a lot...,[],[],part really harmfult lot people went every gui...,-0.8074,0
2,2,2,1.15135e+18,21:25:06,one of the ways I got through my #depression i...,"['#depression', '#uncoveringthenewu', '#change...",[],one ways I got #depression learning dance rain...,0.3382,1
3,3,3,1.15135e+18,21:24:55,see i wanna do one of them but they all say th...,[],[],"see wanna one say PTSD, depression, and/or anx...",-0.4588,0
4,4,4,1.15135e+18,21:24:51,IS IT clinical depression or is it the palpabl...,[],[],IS IT clinical depression palpable hopelessnes...,-0.8316,0


In [14]:
df_new.drop(['Unnamed: 0.1', 'id', 'time', 'tweet', ], axis = 1, inplace = True)
df_new.head()

Unnamed: 0.1,Unnamed: 0,hashtags,cashtags,clean_tweet,vader_score,vader_sentiment_label
0,0,[],[],"Wow, dad yday: “you don’t take stupid depressi...",-0.4122,0
1,1,[],[],part really harmfult lot people went every gui...,-0.8074,0
2,2,"['#depression', '#uncoveringthenewu', '#change...",[],one ways I got #depression learning dance rain...,0.3382,1
3,3,[],[],"see wanna one say PTSD, depression, and/or anx...",-0.4588,0
4,4,[],[],IS IT clinical depression palpable hopelessnes...,-0.8316,0


In [15]:
df_new = df_new[['Unnamed: 0', 'vader_sentiment_label', 'vader_score', 'clean_tweet']]
df_new.head()

Unnamed: 0.1,Unnamed: 0,vader_sentiment_label,vader_score,clean_tweet
0,0,0,-0.4122,"Wow, dad yday: “you don’t take stupid depressi..."
1,1,0,-0.8074,part really harmfult lot people went every gui...
2,2,1,0.3382,one ways I got #depression learning dance rain...
3,3,0,-0.4588,"see wanna one say PTSD, depression, and/or anx..."
4,4,0,-0.8316,IS IT clinical depression palpable hopelessnes...


In [16]:
df_new.to_csv('vader_processed_depressed_tweets.csv')