## Data cleaning

In [1]:
import pandas as pd
pd.options.display.max_columns=100
pd.options.display.max_rows=300
import numpy as np
import pleiades as ple
import json
import re

### Load user data

In [2]:
import_path = r'..\data\#michellewilliams_users.csv'
data = pd.read_csv(import_path, low_memory=False)

In [3]:
data.head()

Unnamed: 0,user.id_str,user.name,user.screen_name,user.description,conservative
0,753643604426489856,Phenice McCall#BorisIsMyPrimeMinister✡️🇬🇧,PheniceMccall,,1.0
1,1104467684,🇺🇸Michael Hays🇺🇸,MichaelHays13,Red Blooded!! All American!! 🌾Heartland born &...,1.0
2,106310615,Manasi Scott,ManasiScott,Indian singer. Army brat. Muay Thai panda. Hor...,0.0
3,2993975016,Steve Rabon,srrabon_,"Christian, SC Gamecocks , College Football, ...",1.0
4,954190967422242817,Bonnie Kranick,BonnieKranick,"Mother of 3 Amma to 3 love my kids, love my gr...",1.0


In [4]:
data.shape

(8381, 5)

In [5]:
data.isnull().sum()

user.id_str            0
user.name              2
user.screen_name       0
user.description    1311
conservative           0
dtype: int64

### Clean user data

Impute null values

In [6]:
data = data.fillna('')

In [7]:
df = data[['user.name', 'user.description', 'conservative']]

Remove unwanted elements from text.

In [8]:
cz = ple.CZ()

In [9]:
emoji_path = r'..\emojis.json'
with open(emoji_path) as f:
    emojis = json.load(f)

In [10]:
emoji_pattern = '(' + '|'.join(emojis) + ')'
hastag_pattern = r'(#\w+\b)'
hastag_seperator = r'#([0-9A-Z]+[a-z]*)+'

twitter_dict = {
    emoji_pattern: r' \1 ',
    hastag_pattern: r' \1 ',
}

replacement = 'genderpronouns'
pronoun_dict = {k: replacement for k in cz.re_ref['gender_pronouns']}

leftover_dict = {
    r'\bKAG2020\b': 'KAG',
    r'\b[0-9a-zA-Z]\b': '',
    r'\s+': ' '
}

In [11]:
def remove_non_emojis_and_hastags(sentence):
    words = sentence.split()
    clean_sentence = ''
    for word in words:
        if re.match(emoji_pattern, word):
            clean_sentence = clean_sentence + ' ' + word
        elif re.match(hastag_pattern, word):
            word = re.sub('#', '', word)
            splitted = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([0-9A-Z]+)', r' \1', word)).split()
            clean_sentence = clean_sentence + ' ' + ' '.join(splitted)
    return clean_sentence

Transform user names.

In [12]:
print('before:', df['user.name'][1])
df['user.name'] = cz.text_list_cleaner(
    df['user.name'].copy(),
    cz.contractions, # remove contractions.
    cz.re_ref['email'], # remove emails.
    cz.re_ref['links'], # remove links.
    pronoun_dict, # condense gender pronouns.
    twitter_dict, # space out emojis and hashtags.
    remove_non_emojis_and_hastags, # removes non emojis and hastags, then splits hashtags in CamelCase into individual words.
    cz.remove_punctuation, # removes punctunation.
    leftover_dict) # remove single letters, numbers and excess whitespaces.
print('after:', df['user.name'][1])

before: 🇺🇸Michael Hays🇺🇸
after:  🇺🇸 🇺🇸


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Now to process user descriptions.

In [13]:
def decompose_hastags(sentence):
    words = sentence.split()
    clean_sentence = ''
    for word in words:
        if re.match(hastag_pattern, word):
            word = re.sub('#', '', word)
            splitted = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([0-9A-Z]+)', r' \1', word)).split()
            clean_sentence = clean_sentence + ' ' + ' '.join(splitted)
        else:
            clean_sentence = clean_sentence + ' ' + word
    return clean_sentence

In [14]:
print('before:', df['user.description'][1])
df['user.description'] = cz.text_list_cleaner(
    df['user.description'].copy(),
    cz.contractions, # remove contractions.
    cz.re_ref['email'], # remove emails.
    cz.re_ref['links'], # remove links.
    pronoun_dict, # condense gender pronouns.
    twitter_dict, # space out emojis and hastags.
    decompose_hastags, # splits hashtags in CamelCase into individual words.
    cz.remove_punctuation, # removes punctunation.
    leftover_dict)
print('after:', df['user.description'][1])

before: Red Blooded!! All American!! 🌾Heartland born & raised. ⭐️ #MAGA #KAG2020 ⭐️
after:  Red Blooded All American 🌾 Heartland born raised ⭐ ️ MAGA KAG ⭐ ️


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [15]:
df['name_and_description'] = df['user.name'] + ' ' + df['user.description']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Strip leading and trailing spaces.

In [16]:
df['name_and_description'] = df['name_and_description'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Save cleaned data into file.

In [17]:
export_path = r'..\data\#michellewilliams_users_clean.csv'
df[['name_and_description', 'conservative']].to_csv(export_path)

### Load validation data

In [18]:
import_path = r'..\data\replies_to_jk_rowling_users.csv'
data = pd.read_csv(import_path, low_memory=False)

In [19]:
data.head()

Unnamed: 0,user.id_str,user.name,user.screen_name,user.description,conservative
0,1091971984659890176,Michael Jefferson,Michael70234589,That's ignorant \nYou're ignorant,0.0
1,146446713,human,amoozeboosh,And I'm back in the room. Last seen on Twitter...,0.0
2,426236968,kj 👽✌,VanillaCreme96,23 | Infant teacher | Part-time magical girl |...,0.0
3,140096386,Keene Maburger🔶,keenemaverick,"Gamer. Communist. He/him, but barely. Also spa...",0.0
4,1195478151914303489,Katie,Katie15230082,May as well just change my name to Kath at thi...,0.0


In [20]:
data.shape

(947, 5)

In [21]:
data.isnull().sum()

user.id_str           0
user.name             0
user.screen_name      0
user.description    106
conservative          0
dtype: int64

### Clean user data

Impute null values

In [22]:
data = data.fillna('')

In [23]:
df = data[['user.name', 'user.description', 'conservative']]

Remove unwanted elements from text.

Transform user names.

In [24]:
print('before:', df['user.name'][2])
df['user.name'] = cz.text_list_cleaner(
    df['user.name'].copy(),
    cz.contractions, # remove contractions.
    cz.re_ref['email'], # remove emails.
    cz.re_ref['links'], # remove links.
    pronoun_dict, # condense gender pronouns.
    twitter_dict, # space out emojis and hashtags.
    remove_non_emojis_and_hastags, # removes non emojis and hastags, then splits hashtags in CamelCase into individual words.
    cz.remove_punctuation, # removes punctunation.
    leftover_dict) # remove single letters, numbers and excess whitespaces.
print('after:', df['user.name'][2])

before: kj 👽✌
after:  👽 ✌


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Now to process user descriptions.

In [25]:
print('before:', df['user.description'][0])
df['user.description'] = cz.text_list_cleaner(
    df['user.description'].copy(),
    cz.contractions, # remove contractions.
    cz.re_ref['email'], # remove emails.
    cz.re_ref['links'], # remove links.
    pronoun_dict, # condense gender pronouns.
    twitter_dict, # space out emojis and hastags.
    decompose_hastags, # splits hashtags in CamelCase into individual words.
    cz.remove_punctuation, # removes punctunation.
    leftover_dict)
print('after:', df['user.description'][0])

before: That's ignorant 
You're ignorant
after:  That is ignorant You are ignorant


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [26]:
df['name_and_description'] = df['user.name'] + ' ' + df['user.description']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Strip leading and trailing spaces.

In [27]:
df['name_and_description'] = df['name_and_description'].str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Save cleaned data into file.

In [28]:
export_path = r'..\data\replies_to_jk_rowling_users_clean.csv'
df[['name_and_description', 'conservative']].to_csv(export_path)