## Data Preprocessing

In [1]:
import pandas as pd
pd.options.display.max_columns=100
pd.options.display.max_rows=300
import numpy as np
import pleiades as ple
import json
import re

Load user data

In [2]:
import_path = r'..\data\#michellewilliams_users.csv'
data = pd.read_csv(import_path, low_memory=False)

In [3]:
data.head()

Unnamed: 0,user.id_str,user.name,user.screen_name,user.description,conservative
0,753643604426489856,Phenice McCall#BorisIsMyPrimeMinister✡️🇬🇧,PheniceMccall,,1.0
1,1104467684,🇺🇸Michael Hays🇺🇸,MichaelHays13,Red Blooded!! All American!! 🌾Heartland born &...,1.0
2,106310615,Manasi Scott,ManasiScott,Indian singer. Army brat. Muay Thai panda. Hor...,0.0
3,2993975016,Steve Rabon,srrabon_,"Christian, SC Gamecocks , College Football, ...",1.0
4,954190967422242817,Bonnie Kranick,BonnieKranick,"Mother of 3 Amma to 3 love my kids, love my gr...",1.0


In [4]:
data.isnull().sum()

user.id_str            0
user.name              2
user.screen_name       0
user.description    1311
conservative           0
dtype: int64

Fill null values

In [5]:
data = data.fillna('')

Separate #hastags and emojis from text

In [6]:
df = data[['user.name', 'user.description']]

In [7]:
cz = ple.CZ()

In [10]:
import emoji

In [11]:
df = data[['user.name', 'user.description']][:3]

In [23]:
emoji_pattern = '(' + '|'.join(emoji.UNICODE_EMOJI.keys()) + ')'

twitter_dict = {
    emoji_pattern: r' \1 ',
}

In [24]:
print('before:', df['user.name'][1])
df['user.name'] = cz.text_list_cleaner(
    df['user.name'].copy(),
    twitter_dict) # space out emojis and hastags.
print('after:', df['user.name'][1])

before:  caught  caught Michael Hays caught  caught 


error: nothing to repeat at position 2740

In [8]:
emoji_path = r'..\data\emojis.json'
with open(emoji_path) as f:
    emojis = json.load(f)
re_emojis = '|'.join(emojis)

In [9]:
emoji_pattern = '(' + '|'.join(emojis) + ')'
hastag_pattern = r'(#\w+\b)'
hastag_seperator = r'#([0-9A-Z]+[a-z]*)+'

twitter_dict = {
    emoji_pattern: r' \1 ',
    hastag_pattern: r' \1 ',
}

In [11]:
replacement = 'genderpronouns'
pronoun_dict = {k: replacement for k in cz.re_ref['gender_pronouns']}

In [12]:
print('before:', df['user.name'][1])
df['user.name'] = cz.text_list_cleaner(
    df['user.name'].copy(),
    cz.contractions, # remove contractions.
    cz.re_ref['email'], # remove emails.
    cz.re_ref['links'], # remove links.
    pronoun_dict, # condense gender pronouns.
    twitter_dict) # space out emojis and hastags.
print('after:', df['user.name'][1])

before: 🇺🇸Michael Hays🇺🇸
after:  🇺🇸 Michael Hays 🇺🇸 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Remove all non hastag and emojis text from names.

In [19]:
for i, sentence in enumerate(df['user.name']):
    words = sentence.split()
    clean_sentence = ''
    for word in words:
        if re.match(emoji_pattern, word):
            clean_sentence = clean_sentence + ' ' + word
        elif re.match(hastag_pattern, word):
            word = re.sub('#', '', word)
            splitted = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([0-9A-Z]+)', r' \1', word)).split()
            clean_sentence = clean_sentence + ' ' + ' '.join(splitted)
    df['user.name'][i] = clean_sentence


In [22]:
# Remove punctuation.
df['user.name'] = cz.text_list_cleaner(df['user.name'].copy(), cz.remove_punctuation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Now to process user descriptions.

In [None]:
print('before:', df['user.description'][1])
df['user.description'] = cz.text_list_cleaner(
    df['user.description'].copy(),
    cz.contractions, # remove contractions.
    cz.re_ref['email'], # remove emails.
    cz.re_ref['links'], # remove links.
    pronoun_dict, # condense gender pronouns.
    twitter_dict) # space out emojis and hastags.
print('after:', df['user.description'][1])

In [33]:
df_test = df[['user.description']]

In [34]:
print('before:', df_test['user.description'][1])
df_test['user.description'] = cz.text_list_cleaner(
    df_test['user.description'].copy(),
    cz.contractions, # remove contractions.
    cz.re_ref['email'], # remove emails.
    cz.re_ref['links'], # remove links.
    pronoun_dict, # condense gender pronouns.
    twitter_dict) # space out emojis and hastags.
print('after:', df_test['user.description'][1])

before: Red Blooded!! All American!! 🌾Heartland born & raised. ⭐️ #MAGA #KAG2020 ⭐️
after: Red Blooded!! All American!!  🌾 Heartland born & raised.  ⭐  ️   #MAGA   #KAG2020   ⭐  ️ 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [35]:
for i, sentence in enumerate(df_test['user.description']):
    words = sentence.split()
    clean_sentence = ''
    for word in words:
        if re.match(hastag_pattern, word):
            word = re.sub('#', '', word)
            splitted = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([0-9A-Z]+)', r' \1', word)).split()
            clean_sentence = clean_sentence + ' ' + ' '.join(splitted)
        else:
            clean_sentence = clean_sentence + ' ' + word
    df_test['user.description'][i] = clean_sentence

In [39]:
df_test['user.description']

0                                                        
1        Red Blooded!! All American!! 🌾 Heartland born...
2        Indian singer. Army brat. Muay Thai panda. Ho...
3        Christian, SC Gamecocks , College Football, O...
4        Mother of 3 Amma to 3 love my kids, love my g...
                              ...                        
8376                             Always Half-in Half-Out.
8377                    Nothing happens after you die....
8378     ACTOR. Writer|Producer. Luso|Latinx. Educator...
8379                                            ✌ 🏻 ❤ ️ 🌻
8380     Founding Partner, MirRam Group; Publisher, Ma...
Name: user.description, Length: 8381, dtype: object

In [40]:
# Remove punctuation.
df_test['user.description'] = cz.text_list_cleaner(df_test['user.description'].copy(), cz.remove_punctuation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [41]:
df_test['user.description']

0                                                        
1        Red Blooded!! All American!! 🌾 Heartland born...
2        Indian singer. Army brat. Muay Thai panda. Ho...
3        Christian, SC Gamecocks , College Football, O...
4        Mother of 3 Amma to 3 love my kids, love my g...
                              ...                        
8376                             Always Half-in Half-Out.
8377                    Nothing happens after you die....
8378     ACTOR. Writer|Producer. Luso|Latinx. Educator...
8379                                            ✌ 🏻 ❤ ️ 🌻
8380     Founding Partner, MirRam Group; Publisher, Ma...
Name: user.description, Length: 8381, dtype: object

In [13]:
import string

In [14]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [None]:
r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [50]:
X = data['user.description'][0:3]
X[2]

'Indian singer. Army brat. Muay Thai panda. Horse-rider. Traveller. Sole keeper of Zephan. For bookings contact@manasiscott.com | https://t.co/MU3LedSHkC'

In [36]:
emoji_path = r'..\data\emojis.json'
with open(emoji_path) as f:
    emojis = json.load(f)

In [37]:
re_emojis = '|'.join(emojis)

In [38]:
emoji_pattern = '(' + '|'.join(emojis) + ')'
hastag_pattern = r'(#\w+\b)'

In [None]:
replacement = 'gender_pronouns'
pronouns_dict = {
    r'[hH]e/[hH]im': replacement,
    r'[sS]he/[hH]er': replacement,
    r'[hH]e/[hH]im': replacement,
    r'[hH]e/[hH]im': replacement,
    r'[hH]e/[hH]im': replacement,
}

In [39]:
twitter_dict = {
    emoji_pattern: r' \1 ',
    hastag_pattern: r' \1 ',
    r'born'
}

In [40]:
print('before:', X[1])
X = cz.text_list_cleaner(X, twitter_dict)
print('after:', X[1])

before: Red Blooded!! All American!! 🌾Heartland born & raised. ⭐️ #MAGA #KAG2020 ⭐️
after: Red Blooded!! All American!!  🌾 Heartland born & raised.  ⭐  ️   #MAGA  #KAG2020  ⭐  ️ 


In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
# cvec = CountVectorizer(max_df=0.5, max_features=3000, min_df=2,
#                        ngram_range=(1, 2), stop_words='english')
cvec = CountVectorizer(token_pattern=r'[^\s]+')
X_cvec = cvec.fit_transform(X)
X_cvec = pd.DataFrame(X_cvec.toarray(), columns=cvec.get_feature_names())
print('CountVectorizer:')
print(X_cvec.sum().sort_values(ascending=False)[:])
print()

CountVectorizer:
⭐️                         2
🌾heartland                 1
https://t.co/mu3ledshkc    1
#maga                      1
&                          1
all                        1
american!!                 1
army                       1
blooded!!                  1
bookings                   1
born                       1
brat.                      1
contact@manasiscott.com    1
for                        1
horse-rider.               1
indian                     1
keeper                     1
muay                       1
of                         1
panda.                     1
raised.                    1
red                        1
singer.                    1
sole                       1
thai                       1
traveller.                 1
zephan.                    1
|                          1
#kag2020                   1
dtype: int64



In [52]:
X_cvec

Unnamed: 0,#kag2020,#maga,&,all,american!!,army,blooded!!,bookings,born,brat.,contact@manasiscott.com,for,horse-rider.,https://t.co/mu3ledshkc,indian,keeper,muay,of,panda.,raised.,red,singer.,sole,thai,traveller.,zephan.,|,⭐️,🌾heartland
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,2,1
2,0,0,0,0,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0


In [None]:
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('dt', ExtraTreesClassifier(n_estimators=100))
])
params = {
    'tvec__stop_words': ['english'],
    'tvec__ngram_range': [(1, 1), (1, 2)],
    'tvec__max_df': [.5, .7, .9],
    'tvec__min_df': [2, 4, 6],
    'tvec__max_features': [2000, 3000, 4000],
}

Copy classifications onto raw data.

In [20]:
data['conservative'] = data2['conservative']

Drop unclassified rows

In [21]:
data = data.dropna(subset=['conservative'])

In [22]:
data.shape

(9036, 354)

Extract desired features for next step.

In [31]:
features = ['user.id_str', 'user.description', 'conservative']

In [32]:
df = data[features]

Drop duplicate users.

In [34]:
df = df.drop_duplicates(subset='user.id_str')

In [35]:
df.shape

(8381, 3)

In [36]:
export_path = r'..\data\#michellewilliams_users.csv'
df.to_csv(export_path, index=False)

In [30]:
df['user.id_str'][0]

753643604426489856

In [4]:
nabe = ple.Nabe()

In [15]:
data = data.dropna(subset=['conservative'])

In [16]:
data.shape

(9036, 354)

In [None]:
export_path = r'..\data\#michellewilliams.csv'
data.to_csv(export_path, index=False)

In [12]:
data['conservative'] = data2['conservative']

In [11]:
nabe.get_nulls(data2)

{'retweeted_status.id_str': 3005,
 'retweeted_status.full_text': 12757,
 'conservative': 4416}

In [36]:
data['conservative'] = data2['conservative']

In [39]:
data = data.dropna(subset=['conservative'])

In [40]:
data.shape

(9036, 354)

In [42]:
data['conservative'].value_counts()

0.0    5031
1.0    4005
Name: conservative, dtype: int64

In [36]:
n = 4012

In [37]:
data.loc[n]

created_at                                                                  Thu Dec 19 23:32:52 +0000 2019
id                                                                                     1207806033495764992
id_str                                                                                 1207806033495764992
full_text                                                @kateolivieri @matumazza @jk_rowling Are you s...
truncated                                                                                            False
display_text_range                                                                                [37, 74]
source                                                   <a href="http://twitter.com/download/iphone" r...
in_reply_to_status_id                                                                          1.20767e+18
in_reply_to_status_id_str                                                              1207671591049605124
in_reply_to_user_id                  

In [38]:
t = data['full_text'][n]
t

'@kateolivieri @matumazza @jk_rowling Are you sure you live in this planet?'

In [781]:
data['conservative'] = [1 if x == t else y for x, y in zip(data['full_text'], data['conservative'])]

In [39]:
export_path = r'..\data\replies_to_jk_rowling_classify3.csv'
data.to_csv(export_path, index=False)

In [798]:
import json

In [822]:
import_path = r'..\data\replies_to_jk_rowling.json'
with open(import_path) as f:
    j = json.load(f)
    data = pd.io.json.json_normalize(j)

# import_path = r'..\data\#michellewilliams_classify2.csv'
# data2 = pd.read_json(import_path, low_memory=False)

In [823]:
data.shape

(5726, 180)

In [803]:
data.columns

Index(['created_at', 'id', 'id_str', 'full_text', 'truncated',
       'display_text_range', 'source', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       ...
       'quoted_status.place.id', 'quoted_status.place.url',
       'quoted_status.place.place_type', 'quoted_status.place.name',
       'quoted_status.place.full_name', 'quoted_status.place.country_code',
       'quoted_status.place.country', 'quoted_status.place.contained_within',
       'quoted_status.place.bounding_box.type',
       'quoted_status.place.bounding_box.coordinates'],
      dtype='object', length=180)

In [810]:
export_path = r'..\data\replies_to_jk_rowling.csv'
data.to_csv(export_path, index=False)

In [808]:
data['retweeted']

0       False
1       False
2       False
3       False
4       False
        ...  
5721    False
5722    False
5723    False
5724    False
5725    False
Name: retweeted, Length: 5726, dtype: bool

In [829]:
df = data[(data['lang'] == 'en') | (data['lang'] == 'und')]

In [830]:
df.shape

(5361, 180)

In [831]:
df = df[['id_str', 'full_text']]

In [832]:
export_path = r'..\data\replies_to_jk_rowling_classify2.csv'
df.to_csv(export_path, index=False)

In [838]:
import_path = r'..\data\replies_to_jk_rowling_classify3.xlsx'
data2 = pd.read_excel(import_path)

In [839]:
data2.shape

(5726, 3)

In [840]:
df.shape

(5361, 2)

In [843]:
df2 = pd.merge(df, data2, how='inner', on='id_str')

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [842]:
df2.shape

(3303, 4)

In [814]:
export_path = r'..\data\replies_to_jk_rowling_classify.csv'
df.to_csv(export_path, index=False)

In [52]:
df['full_text'][0]

'RT @CriticsChoice: #Michellewilliams wins the #criticschoice Award for Best Actress In A Limited Series Or Movie Made For Television for he…'

In [None]:
df['full_text'][0]

In [86]:
df.shape

(13482, 4)

In [66]:
df = df[df['full_text'] != 'RT @CriticsChoice: #Michellewilliams wins the #criticschoice Award for Best Actress In A Limited Series Or Movie Made For Television for he…']

In [67]:
df.shape

(13482, 4)

In [87]:
df.loc[1008]

id_str                                                      1215470667661004800
full_text                     RT @V_actually: Hollywood's full of sexual dev...
retweeted_status.id_str                                             1.21529e+18
retweeted_status.full_text    Hollywood's full of sexual deviants &amp; pedo...
Name: 1008, dtype: object

In [88]:
df['full_text'][1008]

"RT @V_actually: Hollywood's full of sexual deviants &amp; pedophiles that have sold their souls for fame.\n\nIt’s amazing the amount of brainwash…"

In [92]:
df['conservative'] = df['full_text'].map({'RT @V_actually: Hollywood\'s full of sexual deviants &amp; pedophiles that have sold their souls for fame.\n\nIt’s amazing the amount of brainwash…': 1})

In [93]:
df['conservative'].count()

506

In [94]:
export_path = r'..\data\#michellewilliams_classify3.csv'
df.to_csv(export_path, index=False)

In [95]:
import_path = r'..\data\#michellewilliams_classify3.csv'
data2 = pd.read_csv(import_path, low_memory=False)

In [98]:
data2.loc[1205]

id_str                                                      1215316789548199936
full_text                     RT @americansunited: S/O to #MichelleWilliams ...
retweeted_status.id_str                                             1.21468e+18
retweeted_status.full_text                                                  NaN
conservative                                                                NaN
Name: 1205, dtype: object

In [78]:
data2['id_str'][232]

1216078555152617472

In [104]:
import_path = r'..\data\#michellewilliams_classify5.csv'
data3 = pd.read_csv(import_path, low_memory=False)

In [101]:
data3['id_str'][232]

1.22e+18

In [102]:
data3.columns

Index(['id_str', 'full_text', 'retweeted_status.id_str',
       'retweeted_status.full_text', 'conservative'],
      dtype='object')

In [114]:
data2['conservative'] = data3['conservative']

In [115]:
data2['full_text'][101]

"RT @RockofLifeNI: At tonight's #goldenglobes2020 pregnant actress #MichelleWilliams said she wouldn't have won if she hadn't aborted her ot…"

In [118]:
data2['conservative'] = [1 if x == "RT @RockofLifeNI: At tonight's #goldenglobes2020 pregnant actress #MichelleWilliams said she wouldn't have won if she hadn't aborted her ot…" else y for x, y in zip(data2['full_text'], data2['conservative'])]

In [119]:
export_path = r'..\data\#michellewilliams_classify7.csv'
data2.to_csv(export_path, index=False)