In [86]:
# pip install -r requirements.txt

import pandas as pd
import numpy as np

from os import walk, path
from typing import Iterable, Union

In [90]:
# Readers keyed by the FIRST letter of a file's extension, e.g. "t" handles
# ".tsv", "c" handles ".csv" and "j" handles ".json". Each reader takes a file
# path and returns a DataFrame.
EXTENSIONS = {
    "t": lambda file: pd.read_csv(file, sep="\t"),
    "c": lambda file: pd.read_csv(file, sep=","),
    "s": lambda file: pd.read_csv(file, sep=" "),
    "j": lambda file: pd.read_json(file)
}


def _reader_key(file: str) -> str:
    """Return the lower-cased first letter of ``file``'s extension ("" if none)."""
    # splitext only looks at the final path component, so dots in directory
    # names and extension-less files cannot be mistaken for an extension.
    extension = path.splitext(file)[1]
    # extension is "" or starts with "."; slicing never raises, even for "name.".
    return extension[1:2].lower()


def read_data(files: Union[Iterable[str], str]) -> list:
    """Load every supported file into a DataFrame.

    Parameters
    ----------
    files:
        A single path, or any iterable of paths. One-shot iterables such as
        generators are fine: the input is traversed exactly once.

    Returns
    -------
    list
        One DataFrame per supported file, in the order the files were given.
        Files whose extension has no reader in ``EXTENSIONS`` are skipped and
        reported on stdout after all supported files have been loaded.
    """
    if isinstance(files, str):
        files = [files]

    data_frames = []
    unsupported_files = []

    # Single pass: the original iterated `files` twice, which silently dropped
    # the "unsupported" report whenever `files` was a generator.
    for file in files:
        reader = EXTENSIONS.get(_reader_key(file))
        if reader is None:
            unsupported_files.append(file)
        else:
            data_frames.append(reader(file))

    for file in unsupported_files:
        extension = path.splitext(file)[1].lstrip(".")
        # Fall back to the full name when there is no extension to show.
        print(f"Unsupported file extension: {extension or file}", flush=True)

    return data_frames


In [99]:
# os.walk returns file names in arbitrary (filesystem-dependent) order, while
# the unpacking at the bottom of this cell is positional. Sorting the names
# makes the frame order deterministic across machines and re-runs.
root, dirs, files = next(walk("data"))
files = sorted(files)
print(files)

# Build a concrete list (not a one-shot generator) so the paths can be
# inspected or re-used after loading.
file_paths = [path.join(root, file) for file in files]

data = read_data(file_paths)

# Sorted order of the printed names: image-predictions.tsv, tweet-json.json,
# twitter-archive-enhanced.csv.
predictions_df, tweet_json_df, twitter_enhanced_df = data



['image-predictions.tsv', 'tweet-json.json', 'twitter-archive-enhanced.csv']


# Now that the data has loaded successfully, it's time to merge the data frames to reduce duplicates

In [105]:
# Give the key column the name "id" so it can be used as a shared merge key,
# then show the frame's columns and dtypes.
predictions_df.rename({"tweet_id": "id"}, axis="columns", inplace=True)
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       2075 non-null   int64  
 1   jpg_url  2075 non-null   object 
 2   img_num  2075 non-null   int64  
 3   p1       2075 non-null   object 
 4   p1_conf  2075 non-null   float64
 5   p1_dog   2075 non-null   bool   
 6   p2       2075 non-null   object 
 7   p2_conf  2075 non-null   float64
 8   p2_dog   2075 non-null   bool   
 9   p3       2075 non-null   object 
 10  p3_conf  2075 non-null   float64
 11  p3_dog   2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [106]:
# Summarise columns, non-null counts and dtypes of the tweet JSON frame.
# No rename is done here: per the output, its key column is already "id".
tweet_json_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   created_at                     2354 non-null   datetime64[ns, UTC]
 1   id                             2354 non-null   int64              
 2   id_str                         2354 non-null   int64              
 3   full_text                      2354 non-null   object             
 4   truncated                      2354 non-null   bool               
 5   display_text_range             2354 non-null   object             
 6   entities                       2354 non-null   object             
 7   extended_entities              2073 non-null   object             
 8   source                         2354 non-null   object             
 9   in_reply_to_status_id          78 non-null     float64            
 10  in_reply_to_status_id_st

In [108]:
# Give the key column the name "id" so it can be used as a shared merge key,
# then show the frame's columns and dtypes.
twitter_enhanced_df.rename({"tweet_id": "id"}, axis="columns", inplace=True)
twitter_enhanced_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        1611 non-null   object 
 13  doggo                       97 no

In [111]:
merge_on_columns = ["id"]

# Inner-join the three frames on the shared key, so only ids present in all
# three survive. Non-key columns that appear in more than one frame receive
# pandas' default "_x" / "_y" suffixes.
df = (
    predictions_df
    .merge(tweet_json_df, on=merge_on_columns, how="inner")
    .merge(twitter_enhanced_df, on=merge_on_columns, how="inner")
)

# Summarise the merged frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2073 entries, 0 to 2072
Data columns (total 58 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   id                             2073 non-null   int64              
 1   jpg_url                        2073 non-null   object             
 2   img_num                        2073 non-null   int64              
 3   p1                             2073 non-null   object             
 4   p1_conf                        2073 non-null   float64            
 5   p1_dog                         2073 non-null   bool               
 6   p2                             2073 non-null   object             
 7   p2_conf                        2073 non-null   float64            
 8   p2_dog                         2073 non-null   bool               
 9   p3                             2073 non-null   object             
 10  p3_conf                 

# Now that the data is merged, we need to inspect it carefully to find any problems

In [112]:
# List the column labels of the merged frame.
# NOTE(review): the saved output for this cell shows 17 columns including
# "tweet_id", whereas the merged frame reported by df.info() has 58 columns
# keyed on "id". The output looks stale (cell run against a different `df`);
# re-run with Restart Kernel -> Run All to confirm.
print(df.columns)

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')


In [121]:
# Show the distinct values held by each of the four stage columns.
for stage_column in ("doggo", "puppo", "pupper", "floofer"):
    print(df[stage_column].unique())

# Distinct names among the rows flagged as "pupper".
is_pupper = df["pupper"] == "pupper"
print(df.loc[is_pupper, "name"].unique())

[nan 'doggo']
[nan 'puppo']
[nan 'pupper']
[nan 'floofer']
['Roscoe' 'Gus' 'a' nan 'Ginger' 'Jed' 'Sierra' 'Rover' 'Jamesy' 'Boomer'
 'Gidget' 'Pickles' 'Clark' 'Ava' 'Kona' 'Gabe' 'Chelsea' 'Dido'
 'Herschel' 'Cooper' 'Craig' 'Ollie' 'Bones' 'Sampson' 'Bo' 'Baloo'
 'Laika' 'Ralphy' 'mad' 'Pinot' 'Sophie' 'Huck' 'Winnie' 'Willem' 'just'
 'Louie' 'Eve' 'Maggie' 'Jax' 'Lola' 'Milo' 'Malcolm' 'Zoe' 'Finn'
 'actually' 'Edmund' 'Aqua' 'Benji' 'Rory' 'Buckley' 'Rooney' 'all'
 'Rueben' 'Lillie' 'Ashleigh' 'Luther' 'Oliver' 'Derek' 'Clyde' 'Smokey'
 'Ozzy' 'Winston' 'Chuckles' 'Finnegus' 'Adele' 'Ambrose' 'Ralphson'
 'Scooter' 'Sansa' 'Millie' 'Misty' 'Trevith' 'Lorenzo' 'Grady' 'Phil'
 'Wally' 'Gizmo' 'Lucy' 'Trip' 'Hamrick' 'Lizzie' 'Blakely' 'Curtis'
 'Mona' 'Olivia' 'Birf' 'the' 'Marty' 'Otis' 'Hubertson' 'Gerbald'
 'Sweets' 'Banjo' 'Brandy' 'Larry' 'Patrick' 'Charlie' 'Toffee' 'Ellie'
 'Hector' 'Kawhi' 'Rinna' 'Dwight' 'Bella' 'Godzilla' 'Pepper' 'Zuzu'
 'Mollie' 'Superpup' 'Rufio' 'Lenno