In [29]:
import pandas as pd
import pickle

In [30]:
# Restore twitter_archive_enhanced_clean_df from its pickle file.
twitter_archive_enhanced_clean_df = pd.read_pickle(
    'twitter_archive_enhanced_clean_df.pkl'
)

In [31]:
# Display the (rows, columns) shape of the frame that was just loaded.
twitter_archive_enhanced_clean_df.shape

(2175, 14)

## Assess

* Tidiness
  * Each variable forms a column
  * Each observation forms a row
  * Each type of observational unit forms a table  
<br>
- The `doggo`, `floofer`, `pupper` and `puppo` columns are not tidy: a single variable (maturity stage) is spread across four columns

## Clean

### Define

Make the doggo, floofer, pupper, puppo data tidy compliant.  
Requirement: each variable forms a column.  
Create the required column, 'maturity'.

### Code

In [32]:
# Add the single tidy 'maturity' column, initialised to the empty string for
# every row; it is populated from doggo/floofer/pupper/puppo further down.
twitter_archive_enhanced_clean_df["maturity"] = ''

### Test

In [33]:
# List the columns and their non-null counts to confirm 'maturity' was added.
twitter_archive_enhanced_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 15 columns):
tweet_id                 2175 non-null object
in_reply_to_status_id    78 non-null object
in_reply_to_user_id      78 non-null object
timestamp                2175 non-null datetime64[ns]
source                   2175 non-null object
text                     2175 non-null object
expanded_urls            2117 non-null object
rating_numerator         2175 non-null int64
rating_denominator       2175 non-null int64
name                     2175 non-null object
doggo                    2175 non-null object
floofer                  2175 non-null object
pupper                   2175 non-null object
puppo                    2175 non-null object
maturity                 2175 non-null object
dtypes: datetime64[ns](1), int64(2), object(12)
memory usage: 271.9+ KB


Success  
The tidy-compliant 'maturity' column has been created (still empty)

## Assess

Some rows are ambiguous (outliers): they name more than one maturity stage.
* Example:
  * tweet_id - 855851453814013952
  * stages present - doggo and puppo

## Clean

### Define

Identify and remove the ambiguous maturity rows (outliers).  
Capture the counts of each unambiguous individual maturity stage.  
A single maturity stage per row is required for tidy-data compliance.

### Code

In [34]:
print('twitter_archive_enhanced_clean_df.shape')
print(twitter_archive_enhanced_clean_df.shape)
print()

# Legacy layout: one column per maturity stage, where the string 'None'
# means that stage is absent for the tweet.
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
stage_present = twitter_archive_enhanced_clean_df[stage_columns] != 'None'
stages_per_row = stage_present.sum(axis=1)

# Rows naming exactly one stage are valid and are counted per stage.
# Rows naming no stage need no tidying; rows naming two or more stages are
# ambiguous and are queued for deletion.
single_stage = stage_present[stages_per_row == 1]

counter = len(twitter_archive_enhanced_clean_df)
doggo_counter = int(single_stage['doggo'].sum())
floofer_counter = int(single_stage['floofer'].sum())
pupper_counter = int(single_stage['pupper'].sum())
puppo_counter = int(single_stage['puppo'].sum())
multiple_stage_delete_list = twitter_archive_enhanced_clean_df.loc[
    stages_per_row > 1, 'tweet_id'
].tolist()

print('counter - {}'.format(counter))
print('doggo_counter - {}'.format(doggo_counter))
print('floofer_counter - {}'.format(floofer_counter))
print('pupper_counter - {}'.format(pupper_counter))
print('puppo_counter - {}\n'.format(puppo_counter))
print('multiple_stage_delete_list - {}'.format(multiple_stage_delete_list))
print('len(multiple_stage_delete_list) - {}\n'.format(len(multiple_stage_delete_list)))

# Delete every ambiguous maturity stage row in one filter instead of
# re-filtering the whole frame once per tweet_id. The explicit copy gives
# later .loc assignments an independent frame to write into.
twitter_archive_enhanced_clean_df = twitter_archive_enhanced_clean_df[
    ~twitter_archive_enhanced_clean_df.tweet_id.isin(multiple_stage_delete_list)
].copy()

twitter_archive_enhanced_clean_df.shape
(2175, 15)

counter - 2175
doggo_counter - 75
floofer_counter - 9
pupper_counter - 224
puppo_counter - 24

multiple_stage_delete_list - ['855851453814013952', '854010172552949760', '817777686764523521', '808106460588765185', '802265048156610565', '801115127852503040', '785639753186217984', '781308096455073793', '759793422261743616', '751583847268179968', '741067306818797568', '733109485275860992']
len(multiple_stage_delete_list) - 12



### Test

In [35]:
# Print the label, the current shape, and a trailing blank line so the row
# count after the removal step can be checked.
print('twitter_archive_enhanced_clean_df.shape',
      twitter_archive_enhanced_clean_df.shape,
      '',
      sep='\n')

twitter_archive_enhanced_clean_df.shape
(2163, 15)



Success  
12 ambiguous maturity rows removed  
The arithmetic checks out: 2175 - 2163 = 12  
Counts of the unambiguous individual maturity stages were captured

## Clean

### Define

Populate the new 'maturity' column (a pandas Series) from the legacy doggo, floofer, pupper and puppo columns

### Code

In [36]:
def update_maturity(tweet_id, legacy_maturity_stage):
    """Copy a legacy stage column into 'maturity' for one tweet.

    Works directly on the notebook-level ``twitter_archive_enhanced_clean_df``
    and modifies it in place.

    Parameters
    ----------
    tweet_id
        Value compared against the ``tweet_id`` column to select the rows
        to update.
    legacy_maturity_stage
        Label of the column whose values are copied into ``maturity`` for
        the selected rows.

    Returns
    -------
    None
    """
    archive = twitter_archive_enhanced_clean_df
    selected_rows = archive.tweet_id == tweet_id
    archive.loc[selected_rows, 'maturity'] = \
        archive.loc[selected_rows, legacy_maturity_stage]

In [37]:
#
# populate maturity column based on doggo, floofer, pupper, puppo
#

print('twitter_archive_enhanced_clean_df.maturity.value_counts()')
print(twitter_archive_enhanced_clean_df.maturity.value_counts())
print()

# Legacy one-column-per-stage layout; the string 'None' marks "stage absent".
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']

counter = 0
# Rows populated per stage, keyed by legacy column name.
populated_per_stage = dict.fromkeys(stage_columns, 0)

for index, row in twitter_archive_enhanced_clean_df.iterrows():
    counter += 1

    tweet_id = row.tweet_id
    present_stages = [name for name in stage_columns if row[name] != 'None']

    if not present_stages:
        # valid row, nothing to tidy
        continue

    if len(present_stages) > 1:
        # Ambiguous rows should already have been removed; stop loudly.
        # SystemExit is exactly what sys.exit() raises, so no `sys` import
        # is needed.
        print('tweet_id - {}'.format(tweet_id))
        raise SystemExit('ambiguous maturity stage found')

    # Exactly one stage is present. Pass the column *label* to the helper
    # rather than the cell value, so this does not rely on the value being
    # spelled the same as its column name.
    stage_column = present_stages[0]
    populated_per_stage[stage_column] += 1
    update_maturity(tweet_id, stage_column)

# Keep the individual counter names available; every stage, floofer
# included, is now counted.
doggo_counter = populated_per_stage['doggo']
floofer_counter = populated_per_stage['floofer']
pupper_counter = populated_per_stage['pupper']
puppo_counter = populated_per_stage['puppo']

twitter_archive_enhanced_clean_df.maturity.value_counts()
    2163
Name: maturity, dtype: int64



### Test

In [38]:
# Show how many rows carry each 'maturity' value after population.
print('twitter_archive_enhanced_clean_df.maturity.value_counts()')
print(twitter_archive_enhanced_clean_df['maturity'].value_counts())

twitter_archive_enhanced_clean_df.maturity.value_counts()
           1831
pupper      224
doggo        75
puppo        24
floofer       9
Name: maturity, dtype: int64


Success - tidy counts match previous untidy counts
* pre counts (multiple untidy maturity stage columns)
  * doggo_counter - 75
  * floofer_counter - 9
  * pupper_counter - 224
  * puppo_counter - 24
  

* post tidy counts
  * doggo        75
  * floofer       9
  * pupper      224
  * puppo        24


## Clean

### Define

drop legacy individual columns

### Code

In [39]:
# Report the column names before the drop.
print('column names - {}'.format(list(twitter_archive_enhanced_clean_df)))

# Remove the four legacy per-stage columns in one call. A single drop is
# all-or-nothing: if any label is missing, the frame is left untouched
# instead of being reassigned after a partial drop.
twitter_archive_enhanced_clean_df = twitter_archive_enhanced_clean_df.drop(
    ['doggo', 'floofer', 'pupper', 'puppo'], axis=1
)

column names - ['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp', 'source', 'text', 'expanded_urls', 'rating_numerator', 'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo', 'maturity']


### Test

In [40]:
# List the remaining column labels to confirm the legacy stage columns are gone.
remaining_columns = twitter_archive_enhanced_clean_df.columns.tolist()
print('column names - {}'.format(remaining_columns))

column names - ['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp', 'source', 'text', 'expanded_urls', 'rating_numerator', 'rating_denominator', 'name', 'maturity']


Success  
* The individual per-stage maturity columns were removed
* Tidy compliance achieved 

In [41]:
# Persist the tidied DataFrame, writing over the same .pkl file name that
# the notebook loaded its input from.
with open('twitter_archive_enhanced_clean_df.pkl', 'wb') as pickle_file:
    pickle.dump(twitter_archive_enhanced_clean_df, pickle_file)


In [42]:
# Read - make sure we can read it.
# Uses pd.read_pickle, the same loader the notebook's first load cell uses,
# so this check exercises the path by which the file is actually consumed.
twitter_archive_enhanced_clean_df = pd.read_pickle(
    'twitter_archive_enhanced_clean_df.pkl'
)

print('twitter_archive_enhanced_clean_df.shape')
print(twitter_archive_enhanced_clean_df.shape)
print()

twitter_archive_enhanced_clean_df.shape
(2163, 11)

