# Data Cleaning Notebook

In [1]:
# Import the required libraries
import pandas as pd
import numpy as np


In [2]:
# Load the first raw tweet CSV into `raw_sxsw`, preview its first two rows,
# and end the cell with the frame's (rows, columns) shape.
raw_sxsw = pd.read_csv('../data/judge_1377884607_tweet_product_company.csv')
display(raw_sxsw.head(2))
raw_sxsw.shape

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion


(8721, 3)

In [3]:
# Load the Apple sentiment tweet CSV into `raw_apple`, print its
# (rows, columns) shape, and preview the first five rows.
raw_apple = pd.read_csv('../data/apple-twitter-sentiment-texts.csv')
print(raw_apple.shape)
raw_apple.head()

(1630, 2)


Unnamed: 0,text,sentiment
0,Wow. Yall needa step it up @Apple RT @heynyla:...,-1
1,What Happened To Apple Inc? http://t.co/FJEX...,0
2,Thank u @apple I can now compile all of the pi...,1
3,The oddly uplifting story of the Apple co-foun...,0
4,@apple can i exchange my iphone for a differen...,0


In [4]:
# Work on deep copies so the raw frames stay untouched by the cleaning steps
# that follow (a deep copy owns its own copy of the data and of the index).
sxsw = raw_sxsw.copy(deep=True)
apple = raw_apple.copy(deep=True)

In [5]:
# Give the two long survey-question columns of `sxsw` short, descriptive names.
sxsw.rename(
    columns={
        'emotion_in_tweet_is_directed_at': 'labeled_brand_or_product',
        'is_there_an_emotion_directed_at_a_brand_or_product': 'detected_emotion',
    },
    inplace=True,
)


In [6]:
# Show one row of `sxsw` to confirm the renamed column headers.
sxsw.head(1)

Unnamed: 0,tweet_text,labeled_brand_or_product,detected_emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion


In [7]:
# Show one row of `apple` for comparison with the `sxsw` columns.
apple.head(1)

Unnamed: 0,text,sentiment
0,Wow. Yall needa step it up @Apple RT @heynyla:...,-1


## Data cleaning steps:
* __Null check/Null handling__

* __Data type conversion__

In [8]:
# Drop every record whose tweet text is null, in place, from both datasets.
# The text column is named 'tweet_text' in `sxsw` and 'text' in `apple`.
sxsw.dropna(subset=['tweet_text'],inplace=True)
apple.dropna(subset=['text'],inplace=True)

In [9]:
# Data type conversion - make sure that the tweet text is a string
# (ran into an issue where a float value turned up in the text column).
# This runs after the null drop above, so missing values are already gone
# and are not turned into the literal string 'nan'.
sxsw['tweet_text'] = sxsw['tweet_text'].astype(str)
apple['text'] = apple['text'].astype(str)

### New Column Creation
* `human_sentiment` - positive/negative/neutral, derived from the __detected_emotion__ column (SXSW data) or the __sentiment__ column (Apple data). These labels were assigned by humans.
* `brand` - Apple/Google/UNKNOWN, derived from the __labeled_brand_or_product__ column values.

In [10]:
# The new columns are derived from existing label values, so first list the
# unique values present in each dataset's label column.
print(sxsw['detected_emotion'].unique())
print(apple['sentiment'].unique())


['Negative emotion' 'Positive emotion'
 'No emotion toward brand or product' "I can't tell"]
[-1  0  1]


In [11]:
# New column creation functions

def get_sentiment_for_apple(sentiment_number):
    """Translate a numeric sentiment code into a sentiment name.

    Args:
        sentiment_number: Value compared against 1 and -1.

    Returns:
        'positive' when the value equals 1, 'negative' when it equals -1,
        and 'neutral' for anything else.
    """
    if sentiment_number == 1:
        return 'positive'
    if sentiment_number == -1:
        return 'negative'
    # Every value other than 1 / -1 is treated as neutral.
    return 'neutral'

def get_sentiment(detected_emotion_str):
    """Translate a detected-emotion label into a sentiment name.

    Args:
        detected_emotion_str: Label compared against 'Positive emotion'
            and 'Negative emotion'.

    Returns:
        'positive' for 'Positive emotion', 'negative' for 'Negative emotion',
        and 'neutral' for any other value (so labels such as "I can't tell"
        are also reported as neutral).
    """
    if detected_emotion_str == 'Positive emotion':
        return 'positive'
    if detected_emotion_str == 'Negative emotion':
        return 'negative'
    # No exact match with either emotion label: fall back to neutral.
    return 'neutral'


In [12]:
# Create the new sentiment column for the SXSW tweets.
# Only the 'detected_emotion' column is needed, so map get_sentiment over that
# Series directly instead of going row by row with DataFrame.apply(axis=1),
# which builds a full row Series for every record just to read one field.
sxsw['human_sentiment'] = sxsw['detected_emotion'].map(get_sentiment)
sxsw.head(2)

Unnamed: 0,tweet_text,labeled_brand_or_product,detected_emotion,human_sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,positive


In [13]:
# Create the new sentiment column for the Apple tweets.
# Only the numeric 'sentiment' column is needed, so map get_sentiment_for_apple
# over that Series directly instead of going row by row with
# DataFrame.apply(axis=1), which builds a full row Series for every record.
apple['human_sentiment'] = apple['sentiment'].map(get_sentiment_for_apple)
apple.head(2)

Unnamed: 0,text,sentiment,human_sentiment
0,Wow. Yall needa step it up @Apple RT @heynyla:...,-1,negative
1,What Happened To Apple Inc? http://t.co/FJEX...,0,neutral


In [14]:
# Save the processed data to CSV files.
# ALL remaining tweets from each dataset are written; the DataFrame index is
# written as the first column under the header 'id'.
sxsw.to_csv('../data/prepped_sxsw_tweets.csv', index_label='id')
apple.to_csv('../data/prepped_apple_tweets.csv', index_label='id')