# Special Offer Cleaning

Cleaning the special offer dataset in preparation for baseline modeling.

## Package Import

In [60]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re #regex

%matplotlib inline

## Data Import & Cleaning

In [61]:
df = pd.read_csv("spec_offer_train.csv")

In [62]:
df.head()

Unnamed: 0,subject,spec_offer,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,"M. Frank, explore courses to prepare for your ...",1,,1.0,2116.0
1,FREE SHIPPING on 1000s of styles 50xe2x80x9370...,1,,0.0,1789.0
2,"Dress up/down khaki shorts xe2x80x94BTW, every...",1,,,
3,Last chance! Free shipping + 50xe2x80x9370% of...,1,,,
4,Fw: [External Email]Moms Climb for Free to Cel...,1,,,


In [63]:
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [64]:
df.head()

Unnamed: 0,subject,spec_offer
0,"M. Frank, explore courses to prepare for your ...",1
1,FREE SHIPPING on 1000s of styles 50xe2x80x9370...,1
2,"Dress up/down khaki shorts xe2x80x94BTW, every...",1
3,Last chance! Free shipping + 50xe2x80x9370% of...,1
4,Fw: [External Email]Moms Climb for Free to Cel...,1


# Remove leftover b from bytes object

In [65]:
df.subject[29].startswith('b')

True

In [66]:
df.subject[29][0]

'b'

In [67]:
df.subject[29]

'bFree shipping on everythingxe2x80x94and its all 50% off'

In [68]:
df.subject[29].replace('b','',1)

'Free shipping on everythingxe2x80x94and its all 50% off'

In [69]:
df[df.subject.str.startswith('b') == True]

Unnamed: 0,subject,spec_offer
29,bFree shipping on everythingxe2x80x94and its a...,1
33,bClocks ticking: 50% off everything + free shi...,1
51,bWhats better than food? FREE food!,1
166,bIts back: 20% off everything! xf0x9fx8ex89,1
177,bxe2x8fxb3xe2x8cx9b Times running out! Online ...,1
...,...,...
4063,bMeet this years Top 20 DIY Costumes!,0
4086,bThese rewards wont last!,1
4133,"bMatthew, heres how your FICOxc2xae Score is a...",0
4194,"bOkay, heres the deal with ink and toner",0


In [70]:
for i in df.index:
    if df.subject[i].startswith('b') == True:
        df.at[i, 'subject'] = df.subject[i].replace("b",'',1)
    else:
        pass

In [71]:
df[df.subject.str.startswith('b') == True]

Unnamed: 0,subject,spec_offer


## Advanced Emoji Cleaning

In [72]:
emoji_lib = pd.read_csv("emoji_lib_expanded.csv")

In [73]:
emoji_lib.head()

Unnamed: 0,hex_codepoint,status,emoji,e_version,description,utf8_bytes,utf8_str,utf8_clean,unicode_bytes,unicode_str
0,1F600,fully-qualified,😀,E1.0,grinning face,b'\xf0\x9f\x98\x80',\xf0\x9f\x98\x80,xf0x9fx98x80,b'\\U0001f600',\\U0001f600
1,1F603,fully-qualified,😃,E0.6,grinning face with big eyes,b'\xf0\x9f\x98\x83',\xf0\x9f\x98\x83,xf0x9fx98x83,b'\\U0001f603',\\U0001f603
2,1F604,fully-qualified,😄,E0.6,grinning face with smiling eyes,b'\xf0\x9f\x98\x84',\xf0\x9f\x98\x84,xf0x9fx98x84,b'\\U0001f604',\\U0001f604
3,1F601,fully-qualified,😁,E0.6,beaming face with smiling eyes,b'\xf0\x9f\x98\x81',\xf0\x9f\x98\x81,xf0x9fx98x81,b'\\U0001f601',\\U0001f601
4,1F606,fully-qualified,😆,E0.6,grinning squinting face,b'\xf0\x9f\x98\x86',\xf0\x9f\x98\x86,xf0x9fx98x86,b'\\U0001f606',\\U0001f606


In [74]:
def emoji_in_num_name(df, emoji_lib):
    """
    Identifies emojis in utf-8 codepoints, unicode codepoints or native format and creates new emoji features in your df.
    
    Input: DataFrame with 'subject' feature
    
    Output: New DataFrame with three new features.
        - 'emoji_in': True is an emoji is present in the subject, else False
        - 'emoji_num': the number of emojis in the subject, 0 is no emoji is present
        - 'emoji_name': the names of the emojis in the subject, 'None' if no emoji is present
    
    """
    
    # check each subject line if it contains an emoji listed in the library.
    # add all emojis to a dataframe called emojis
    for i in df.index:
        utf_8 = [ele for ele in emoji_lib['utf8_clean'] if(ele in df.subject[i])]
        native = [ele for ele in emoji_lib['emoji'] if(ele in df.subject[i])]
        emojis = pd.DataFrame(set(utf_8 + native), columns = ['emo'])
        
        # check the length of emoji list to create a new feature called 'emoji-in' that contains:
        # False if no emoji is in a subject line
        # True if an emoji is in a subject line
        if len(emojis['emo']) == 0:
            df.at[i, 'emoji_in'] = False
        else:
            df.at[i, 'emoji_in'] = True
        
        # add the number of emojis in the subject line into a new feature called 'emoji_num'
        df.at[i, 'emoji_num'] = len(emojis['emo'])
        
        #create an empty dataframe that mirrors the emoji library dataset
        emoji_ind = pd.DataFrame(columns = ['hex_codepoint',	'status',	'emoji',	'e_version',	'description',	'utf8_bytes',	'utf8_str',	'utf8_clean',	'unicode_bytes',	'unicode_str'])
        
        # append the rows associated with the subject line emojis to the empty dataframe
        for x in emojis.index:
            emoji_ind = pd.concat([emoji_ind, emoji_lib[emoji_lib['utf8_clean'] == emojis.emo[x]]])

        # grab the names of the emojis and insert as a string into a new feature called 'emoji_name'
        df.at[i, 'emoji_name'] = str(list(emoji_ind['description']))
        
        # remove the utf-8 code point from the subject line
        if len(emoji_ind.utf8_clean) == 0:
            pass
        else:
            for y in emoji_ind.index:
                df.at[i, 'subject'] = df.subject[i].replace(emoji_ind.utf8_clean[y], "")
        
        # if there is no emoji name, replace the empty list with 'None'
        # else clean the string emoji names by removing the list brackets.
        if df['emoji_name'][i] == '[]':
            df.at[i, 'emoji_name'] = "None"
        else:
            df.at[i, 'emoji_name'] = df.emoji_name[i].replace('[','')
            df.at[i, 'emoji_name'] = df.emoji_name[i].replace(']','')
                
    return df

In [75]:
df = emoji_in_num_name(df, emoji_lib)

In [76]:
df.head()

Unnamed: 0,subject,spec_offer,emoji_in,emoji_num,emoji_name
0,"M. Frank, explore courses to prepare for your ...",1,False,0.0,
1,FREE SHIPPING on 1000s of styles 50xe2x80x9370...,1,False,0.0,
2,"Dress up/down khaki shorts xe2x80x94BTW, every...",1,False,0.0,
3,Last chance! Free shipping + 50xe2x80x9370% of...,1,False,0.0,
4,Fw: [External Email]Moms Climb for Free to Cel...,1,False,0.0,


In [77]:
df[df.emoji_in == True]

Unnamed: 0,subject,spec_offer,emoji_in,emoji_num,emoji_name
30,The coziest hoodies + joggers for you xe2x80x...,1,True,2.0,"'rainbow', 'volcano'"
33,Clocks ticking: 50% off everything + free ship...,1,True,1.0,'alarm clock'
65,Your order ships freexe2x80x94no minimum!,1,True,3.0,"'shopping bags', 'shopping bags', 'couch and l..."
73,Last chance! Free shipping + extra 45% off ev...,1,True,1.0,'alarm clock'
81,Deals on deals on deals: free shipping + 60% ...,1,True,3.0,"'shopping bags', 'shopping bags', 'couch and l..."
...,...,...,...,...,...
4196,Health at Home: Have a Hauntingly Healthy Hall...,0,True,1.0,'ghost'
4207,xefxb8x8f The one sentence thatxe2x80x99s made...,0,True,3.0,"'victory hand', 'victory hand', 'raised hand'"
4209,5 See your weekly highlights from Simply Piano,0,True,2.0,"'violin', 'musical note'"
4218,Want a chance to win? 0,1,True,2.0,"'waving hand', 'eyes'"


## UTF-8 Punctuation Cleaning

In [78]:
char_set = pd.read_pickle("spec_chars.pkl")

In [79]:
char_set.head(10)

Unnamed: 0,type,character,utf-8,description
0,punctuation,-,xe2x80x93,dash
1,punctuation,',xe2x80x99,single quote
2,punctuation,,xc2xa0,space
3,punctuation,,xefxb8x8f,space
4,punctuation,-,xe2x80x94,dash
5,punctuation,"""",xe2x80x9c,double quote
6,punctuation,"""",xe2x80x9d,double quote
7,punctuation,¢,xc2xa2,cents
8,punctuation,…,xe2x80xa6,ellipsis


In [80]:
def char_in_num_name(df, char_set):
    """
    Identifies special characters in utf-8 codepoints and replaces with actual character.
    
    Input: DataFrame with 'subject' feature
    
    Output: New DataFrame with three new features.
        - 'emoji_in': True is an emoji is present in the subject, else False
        - 'emoji_num': the number of emojis in the subject, 0 is no emoji is present
        - 'emoji_name': the names of the emojis in the subject, 'None' if no emoji is present
    
    """
    
    # check each subject line if it contains a special character listed in the library.
    # add all characters to a dataframe called chars
    for i in df.index:
        utf_8 = [ele for ele in char_set['utf-8'] if(ele in df.subject[i])]
        chars = pd.DataFrame(set(utf_8), columns = ['char'])
        
        # check the length of chars list to create a new feature called 'chars-in' that contains:
        # False if no character is in a subject line
        # True if a character is in a subject line
        if len(chars['char']) == 0:
            df.at[i, 'chars_in'] = False
        else:
            df.at[i, 'chars_in'] = True
        
        # add the number of characters in the subject line into a new feature called 'char_num'
        df.at[i, 'char_num'] = len(chars['char'])
        
        #create an empty dataframe that mirrors the character set library dataset
        chars_ind = pd.DataFrame(columns = ['type', 'character', 'utf-8', 'description'])
        
        # append the rows associated with the subject line characters to the empty dataframe
        for x in chars.index:
            chars_ind = pd.concat([chars_ind, char_set[char_set['utf-8'] == chars.char[x]]])

        # grab the names of the characters and insert as a string into a new feature called 'char_name'
        df.at[i, 'char_name'] = str(list(chars_ind['description']))
        
        # replace the utf-8 code point with the actual character from the subject line
        if len(chars_ind['utf-8']) == 0:
            pass
        else:
            for y in chars_ind.index:
                df.at[i, 'subject'] = df.subject[i].replace(chars_ind['utf-8'][y], chars_ind['character'][y])
        
        # if there is no character name, replace the empty list with 'None'
        # else clean the string character names by removing the list brackets.
        if df['char_name'][i] == '[]':
            df.at[i, 'char_name'] = "None"
        else:
            df.at[i, 'char_name'] = df.char_name[i].replace('[','')
            df.at[i, 'char_name'] = df.char_name[i].replace(']','')
            
    return df

In [81]:
df = char_in_num_name(df, char_set)

In [82]:
df.head(50)

Unnamed: 0,subject,spec_offer,emoji_in,emoji_num,emoji_name,chars_in,char_num,char_name
0,"M. Frank, explore courses to prepare for your ...",1,False,0.0,,False,0.0,
1,FREE SHIPPING on 1000s of styles 50-70% off,1,False,0.0,,True,1.0,'dash'
2,"Dress up/down khaki shorts -BTW, everything sh...",1,False,0.0,,True,1.0,'dash'
3,Last chance! Free shipping + 50-70% off almost...,1,False,0.0,,True,1.0,'dash'
4,Fw: [External Email]Moms Climb for Free to Cel...,1,False,0.0,,False,0.0,
5,Only a Few Hours Left! Get a Free Childrens or...,1,False,0.0,,False,0.0,
6,Up to 70% off + FREE shipping,1,False,0.0,,False,0.0,
7,Your $40 GapCash gift (and free shipping!),1,False,0.0,,False,0.0,
8,Your free meal is waiting,1,False,0.0,,False,0.0,
9,"M. Frank, redeem these three (3) unlocked cour...",1,False,0.0,,False,0.0,


In [84]:
df.to_csv('spec_offer_train_cleaned.csv', index=False)

## Check Datatypes

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4229 entries, 0 to 4228
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   subject     4229 non-null   object 
 1   spec_offer  4229 non-null   int64  
 2   emoji_in    4229 non-null   object 
 3   emoji_num   4229 non-null   float64
 4   emoji_name  4229 non-null   object 
 5   chars_in    4229 non-null   object 
 6   char_num    4229 non-null   float64
 7   char_name   4229 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 264.4+ KB


In [86]:
type(df.subject[0])

str

In [87]:
type(df.spec_offer[0])

numpy.int64

In [88]:
df['spec_offer'] = df['spec_offer'].astype(int)

In [89]:
type(df.spec_offer[0])

numpy.int64

## Remove Unnecessary Spaces

Removing spaces at the beginning and end of each subject line.

In [90]:
df.subject = df.subject.str.strip()

Removing spaces surrounding punction marks.

In [91]:
def pun_before(text):
    updated = re.sub(r"\s([?.!’,:\s|$])", r"\1", text)
    return updated

def pun_after(text):
    updated = re.sub(r'''([’])\s''', r'\1', text)
    return updated

def fix_pun(text):
    before = pun_before(text)
    after = pun_after(before)
    return after

In [92]:
df['subject'] = df.subject.apply(fix_pun)

## Lowercase

In [93]:
df['subject'] = df['subject'].apply(lambda x: x.lower())

In [94]:
df.head()

Unnamed: 0,subject,spec_offer,emoji_in,emoji_num,emoji_name,chars_in,char_num,char_name
0,"m. frank, explore courses to prepare for your ...",1,False,0.0,,False,0.0,
1,free shipping on 1000s of styles 50-70% off,1,False,0.0,,True,1.0,'dash'
2,"dress up/down khaki shorts -btw, everything sh...",1,False,0.0,,True,1.0,'dash'
3,last chance! free shipping + 50-70% off almost...,1,False,0.0,,True,1.0,'dash'
4,fw: [external email]moms climb for free to cel...,1,False,0.0,,False,0.0,


## Expand Contractions

In [95]:
cList = {
"ain’t": "is not",
"aren’t": "are not",
"can’t": "cannot",
"can’t’ve": "cannot have",
"’cause": "because",
"could’ve": "could have",
"couldn’t": "could not",
"couldn’t’ve": "could not have",
"didn’t": "did not",
"doesn’t": "does not",
"don’t": "do not",
"hadn’t": "had not",
"hadn’t’ve": "had not have",
"hasn’t": "has not",
"haven’t": "have not",
"he’d": "he would",
"he’d’ve": "he would have",
"he’ll": "he will",
"he’ll’ve": "he he will have",
"he’s": "he is",
"how’d": "how did",
"how’d’y": "how do you",
"how’ll": "how will",
"how’s": "how is",
"I’d": "I would",
"I’d’ve": "I would have",
"I’ll": "I will",
"I’ll’ve": "I will have",
"I’m": "I am",
"I’ve": "I have",
"i’d": "i would",
"i’d’ve": "i would have",
"i’ll": "i will",
"i’ll’ve": "i will have",
"i’m": "i am",
"i’ve": "i have",
"isn’t": "is not",
"it’d": "it would",
"it’d’ve": "it would have",
"it’ll": "it will",
"it’ll’ve": "it will have",
"it’s": "it is",
"let’s": "let us",
"ma’am": "madam",
"mayn’t": "may not",
"might’ve": "might have",
"mightn’t": "might not",
"mightn’t’ve": "might not have",
"must’ve": "must have",
"mustn’t": "must not",
"mustn’t’ve": "must not have",
"needn’t": "need not",
"needn’t’ve": "need not have",
"o’clock": "of the clock",
"oughtn’t": "ought not",
"oughtn’t’ve": "ought not have",
"shan’t": "shall not",
"sha’n’t": "shall not",
"shan’t’ve": "shall not have",
"she’d": "she would",
"she’d’ve": "she would have",
"she’ll": "she will",
"she’ll’ve": "she will have",
"she’s": "she is",
"should’ve": "should have",
"shouldn’t": "should not",
"shouldn’t’ve": "should not have",
"so’ve": "so have",
"so’s": "so as",
"that’d": "that would",
"that’d’ve": "that would have",
"that’s": "that is",
"there’d": "there would",
"there’d’ve": "there would have",
"there’s": "there is",
"they’d": "they would",
"they’d’ve": "they would have",
"they’ll": "they will",
"they’ll’ve": "they will have",
"they’re": "they are",
"they’ve": "they have",
"to’ve": "to have",
"wasn’t": "was not",
"we’d": "we would",
"we’d’ve": "we would have",
"we’ll": "we will",
"we’ll’ve": "we will have",
"we’re": "we are",
"we’ve": "we have",
"weren’t": "were not",
"what’ll": "what will",
"what’ll’ve": "what will have",
"what’re": "what are",
"what’s": "what is",
"what’ve": "what have",
"when’s": "when is",
"when’ve": "when have",
"where’d": "where did",
"where’s": "where is",
"where’ve": "where have",
"who’ll": "who will",
"who’ll’ve": "who will have",
"who’s": "who is",
"who’ve": "who have",
"why’s": "why is",
"why’ve": "why have",
"will’ve": "will have",
"won’t": "will not",
"won’t’ve": "will not have",
"would’ve": "would have",
"wouldn’t": "would not",
"wouldn’t’ve": "would not have",
"y’all": "you all",
"y’all’d": "you all would",
"y’all’d’ve": "you all would have",
"y’all’re": "you all are",
"y’all’ve": "you all have",
"you’d": "you would",
"you’d’ve": "you would have",
"you’ll": "you will",
"you’ll’ve": "you will have",
"you’re": "you are",
"you’ve": "you have"
}

In [96]:
contractions_re = re.compile('(%s)' % '|'.join(cList.keys()))
def expand_contractions(s, contractions_dict=cList):
     def replace(match):
         return cList[match.group(0)]
     return contractions_re.sub(replace, s)

In [97]:
df['subject'] = df.subject.apply(lambda x: expand_contractions(x, contractions_dict=cList))

In [98]:
df.to_csv('spec_offer_train_cleaned.csv', index=False)