# Data Cleaning Notebook

In [20]:
# Import the required libraries
import pandas as pd
import numpy as np


In [21]:
# Functions for data cleaning


#  Functions for Duplicate checks 
def get_exact_dups(df):
    dups = df[df.duplicated()]
    return dups

def get_tweet_text_dups(df, col_names):
    dups = df[df.duplicated(subset=col_names)]
    return dups



In [22]:
raw_data = pd.read_csv('../data/crowdflower-brands-and-product-emotions/data/judge_1377884607_tweet_product_company.csv')

raw_data.shape

(8721, 3)

In [23]:
raw_data.head(3)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion


## Original Data Column Descriptions

* tweet_text - Text/content of the Tweet
* emotion_in_tweet_is_directed_at - Brand/Product Category (Apple Product or Google Product)
* is_there_an_emotion_directed_at_a_brand_or_product - Human labeled emotion/sentiment of the Tweet text, Positive/Negative/Neutral


### Make a deep copy of the original data and rename the columns for readablilty/ease of typing

## Renamed Data columns
* tweet_text 
* brand_or_product
* emotion

In [24]:
# Make a deep copy before any data cleaning (Deep copy has own copy of data and index)
processed_data = raw_data.copy(deep=True)

In [25]:
# Rename those verbose columns!
processed_data.rename(columns={'emotion_in_tweet_is_directed_at': 'labeled_brand_or_product', 'is_there_an_emotion_directed_at_a_brand_or_product': 'detected_emotion'}, inplace=True)
processed_data.head(1)

Unnamed: 0,tweet_text,labeled_brand_or_product,detected_emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion


## Data cleaning steps:
* __Duplicate check/ Duplicate handling__ 

* __Null check/Null handling__

* __Data type conversion__

In [26]:
# Duplicate check and handling

# We do have EXACT duplicates?
exact_dups = get_exact_dups(processed_data)
print(len(exact_dups))

# Drop 'em
processed_data.drop_duplicates(inplace=True)

22


In [27]:
# Do we have duplicated tweet texts?
tweet_dups = get_tweet_text_dups(processed_data, ['tweet_text'])
print(len(tweet_dups))

# Yes!  What is going on here?
display(tweet_dups)

print(tweet_dups['tweet_text'].unique())

5


Unnamed: 0,tweet_text,labeled_brand_or_product,detected_emotion
3628,Win free ipad 2 from webdoc.com #sxsw RT,iPad,Positive emotion
3629,Win free iPad 2 from webdoc.com #sxsw RT,iPad,Positive emotion
6063,RT @mention Marissa Mayer: Google Will Connect...,,No emotion toward brand or product
6064,RT @mention Marissa Mayer: Google Will Connect...,,No emotion toward brand or product
6345,RT @mention RT @mention It's not a rumor: Appl...,,No emotion toward brand or product


['Win free ipad 2 from webdoc.com #sxsw RT'
 'Win free iPad 2 from webdoc.com #sxsw RT'
 'RT @mention Marissa Mayer: Google Will Connect the Digital &amp; Physical Worlds Through Mobile - {link} #sxsw'
 'RT @mention Marissa Mayer: Google Will Connect the Digital &amp; Physical Worlds Through Mobile - {link} #SXSW'
 "RT @mention RT @mention It's not a rumor: Apple is opening up a temporary store in downtown Austin for #SXSW and the iPad 2 launch {link}"]


In [28]:
'''
It looks like pandas is doing something interesting with checking for duplicate text values.

3628 and 3629 have ALL the same words but a small difference in case (ipad vs iPad)
6063, 6064 have ALL the same words but a small difference in case (#sxsw vs #SXSW)
6345 - Not sure why this was returned as a duplicate. Will keep it. 

Overall- there are very few duplicated tweets.  Just keep them all.
'''


'\nIt looks like pandas is doing something interesting with checking for duplicate text values.\n\n3628 and 3629 have ALL the same words but a small difference in case (ipad vs iPad)\n6063, 6064 have ALL the same words but a small difference in case (#sxsw vs #SXSW)\n6345 - Not sure why this was returned as a duplicate. Will keep it. \n\nOverall- there are very few duplicated tweets.  Just keep them all.\n'

In [29]:
# Null checks

# What percentage of data is null?
display(processed_data.isnull().mean())

tweet_text                  0.000115
labeled_brand_or_product    0.636740
detected_emotion            0.000000
dtype: float64

We have some null tweet_text values.  Drop these rows as we have to have text to classify.


In [32]:
processed_data.dropna(subset=['tweet_text'],inplace=True)

~64% of the brand_or_product data is null.  What to do? 

My text classifier will only be interested in postitive or negative sentiment on a product.

Don't drop any rows. Fill the null with UNKNOWN for now.

In [34]:
processed_data['labeled_brand_or_product'].fillna('UNKNOWN', inplace=True)
display(processed_data.isnull().mean())

tweet_text                  0.0
labeled_brand_or_product    0.0
detected_emotion            0.0
dtype: float64

In [35]:
# Data type conversion - make sure that tweet text is a string.... ran in issue with finding  a float value
processed_data['tweet_text'] = processed_data['tweet_text'].astype(str)

### New Column Creation
* sentiment - Positive/Negative/Neutral based off of __detected_emotion__ column values.
* brand - Apple/Google/UNKNOWN based off of __brand_or_product__column values.
* tweet_char_count - Total number of character count in Tweet

In [36]:
# Create new cols based of existing values. What are the unique values we have to work with?
print(processed_data['detected_emotion'].unique())
print(processed_data['labeled_brand_or_product'].unique())


['Negative emotion' 'Positive emotion'
 'No emotion toward brand or product' "I can't tell"]
['iPhone' 'iPad or iPhone App' 'iPad' 'Google' 'UNKNOWN' 'Android' 'Apple'
 'Android App' 'Other Google product or service'
 'Other Apple product or service']


In [37]:
# New column creation functions

def get_sentiment(detected_emotion_str):
    sentiment = 'Neutral'
    if detected_emotion_str == 'Positive emotion':
        sentiment = 'Positive'
    elif detected_emotion_str == 'Negative emotion':
        sentiment = 'Negative'
    return sentiment

def get_brand(labeled_brand_or_product_str, tweet_text_str):
    brand = 'UNKNOWN'
    if ((labeled_brand_or_product_str.lower().__contains__('google')) or (labeled_brand_or_product_str.lower().__contains__('android'))):
        brand = 'Google'
    elif ((labeled_brand_or_product_str.lower().__contains__('apple')) or (labeled_brand_or_product_str.lower().__contains__('ip'))):
        brand = 'Apple'

    # next level of trying detect brand if it was not labeled in the original data
    # if both brand identifiers are in the tweet text...the first match (google) will be brand
    if (brand == 'UNKNOWN'):
        tweet_lc = tweet_text_str.lower()
        has_google_identifiers = (tweet_lc.__contains__('google')) or (tweet_lc.lower().__contains__('android'))
        has_apple_identifers = (tweet_lc.__contains__('apple')) or (tweet_lc.__contains__('ip'))

        if (has_google_identifiers and has_apple_identifers):
            brand ='BOTH'
        elif (has_google_identifiers):
            brand = 'Google'
        elif (has_apple_identifers):
            brand = 'Apple'

    return brand

def get_tweet_char_count(tweet_text_str):
    return len(tweet_text_str.strip())

In [38]:
# Create the new sentiment column
processed_data['sentiment'] = processed_data.apply(lambda row: get_sentiment(row['detected_emotion']), axis=1)
processed_data.head(2)

Unnamed: 0,tweet_text,labeled_brand_or_product,detected_emotion,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Negative
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Positive


In [39]:
# Create the new brand column
processed_data['brand'] = processed_data.apply(lambda row: get_brand(row['labeled_brand_or_product'], row['tweet_text']), axis=1)
processed_data.tail(2)

Unnamed: 0,tweet_text,labeled_brand_or_product,detected_emotion,sentiment,brand
8719,Some Verizon iPhone customers complained their...,UNKNOWN,No emotion toward brand or product,Neutral,Apple
8720,�ϡ�����_��ʋ�΋�ҋ�������⋁_��������_���RT @mentio...,UNKNOWN,No emotion toward brand or product,Neutral,Google


In [40]:
processed_data['tweet_char_count'] = processed_data.apply(lambda row: get_tweet_char_count(row['tweet_text']), axis=1)
processed_data.head(2)

Unnamed: 0,tweet_text,labeled_brand_or_product,detected_emotion,sentiment,brand,tweet_char_count
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Negative,Apple,127
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Positive,Apple,139


In [41]:
# Save the processed data to files 
# ALL tweets
processed_data.to_csv('../data/cleaned_tweets_all.csv', index_label='id')
# Positive tweets
processed_data[processed_data['sentiment'] == 'Positive'].to_csv('../data/cleaned_tweets_positive.csv', index_label='id')
# Negative tweets
processed_data[processed_data['sentiment'] == 'Negative'].to_csv('../data/cleaned_tweets_negative.csv', index_label='id')
# Neutral tweets
processed_data[processed_data['sentiment'] == 'Neutral'].to_csv('../data/cleaned_tweets_neutral.csv', index_label='id')