# ETL Project

## Team: ETheL 

### Neil Patel, Caroline Miller, Ashley Fay

In [1]:
#Required imports

import pandas as pd
import numpy as np
import datetime as datetime
import json
from pandas.io.json import json_normalize 

In [2]:
#Load first data file, Amazon

file_1 = "amazon_com.csv"
amazon = pd.read_csv(file_1)
#amazon.head(2)

In [3]:
#Load second data file, Amazon with extras

file_2 = "amazon_com_extras.csv"
amazon_ex = pd.read_csv(file_2, encoding = "ISO-8859-1")

#Select columns desired for usable df
amazon_ex = amazon_ex[['ASIN','GROUP','FORMAT','TITLE','AUTHOR','PUBLISHER']]

#Make column headers lowercase
amazon_ex.columns = amazon_ex.columns.str.lower()

#amazon_ex.head(2)

In [4]:
#Load third data file, Netflix movies

file_3 = "netflix_movie_titles.csv"

#Apply encoding and code for file errors
netflix = pd.read_csv(file_3, encoding = "ISO-8859-1", header = None, engine = 'python', error_bad_lines=False)

#Rename column names from index based
netflix.rename(columns={0:'Count',1:'Year', 2:'Movie Title'}, inplace=True)


#Drop count
netflix.drop(columns=['Count'], inplace = True)

#netflix.head(2)

In [5]:
#Netflix file - Remove NaN and replace with 0 in the Year column
netflix['Year'].fillna(0, inplace = True)

#Change column from float to int
netflix['Year'].astype(int).astype(str)

netflix.dtypes

Year           float64
Movie Title     object
dtype: object

In [6]:
#Load fourth file, Movies_Metadata

file_4 = "movies_metadata.csv"

#Apply encoding and code for file errors
movies = pd.read_csv(file_4, encoding = "ISO-8859-1", engine = 'python', error_bad_lines=False)

#movies.head()

In [7]:
#Drop columns from the movies file not being used
movies.drop(columns=['adult','budget','homepage','popularity','id','production_countries','imdb_id','original_language','original_title','poster_path','production_companies','belongs_to_collection','runtime','revenue','overview','spoken_languages','video','vote_count'], inplace = True)
#movies.head()

In [8]:
#Merge movies file and amazon books file into 1 df by title
merged_df = pd.merge(amazon_ex, movies, on='title', how = 'inner')

#Make title column the index
#merged_df = merged_df.set_index('title')

#merged_df.head(2)

In [9]:
#Now merge Netflix file into df 
merged_df_2 = pd.merge(merged_df, netflix, left_on='title',right_on= 'Movie Title')
#merged_df_2.head()

In [10]:
#Drop duplicate titles from merged_df_2, keep first
merged_df_2.drop_duplicates('title', keep = 'first', inplace=True)
merged_df_2.head(1)

Unnamed: 0,asin,group,format,title,author,publisher,genres,release_date,status,tagline,vote_average,Year,Movie Title
0,1405246413,book,hardcover,Go,Diego,Go! Annual 2010,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",1999-04-09,Released,A weekend wasted is never a wasted weekend.,7.0,1999.0,Go


In [11]:
#Pull out title and genres column of merged_df_2 to work with dictionaries ect within the column
#Create new smaller df to work with
genre_df = merged_df_2[['title','genres']]

#Reset the index
genre_df = genre_df.reset_index(drop=True)

genre_df.head(1)

Unnamed: 0,title,genres
0,Go,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name..."


In [12]:
#Create itterator that adjusts for strings by taking length of frame throught the index.
d = {'id': [], 'name': [], 'title': []}
genre_df_out = pd.DataFrame()
for index in range(len(genre_df.genres)):
    item = genre_df.genres[index]
    title = genre_df.title[index]
    #print (title)
    dataItem = json.loads(item.replace("'",'"'))
    
    df_genre = pd.DataFrame(data=dataItem)
    new_bucket = []
    tempName = ''
    genre1 = ''
    genre2 = ''
    genre3 =''
    genre4 = ''
    tempID = ''
    for index2, row in df_genre.iterrows():
        #print(row["id"],row["name"],title)
        #name = name + row["name"]
        tempName = tempName + str(row["name"]) + ','
        tempID = tempID + str(row["id"]) + ','
        if(index2 == 0):genre1 = row["name"]
        if(index2 == 1):genre2 = row["name"]
        if(index2 == 2):genre3 = row["name"]
        if(index2 == 3):genre4 = row["name"]
    
    
    new_bucket.append({'id': tempID, 'genre1': genre1,'genre2': genre2,'genre3': genre3,'genre4': genre4, 'title': title})
    genre_df_out = genre_df_out.append(new_bucket)
    
    #print(genre_df_out)   

    
    

In [13]:
#Show final output of original genre column
genre_df_out.head(3)

Unnamed: 0,genre1,genre2,genre3,genre4,id,title
0,Crime,Comedy,Thriller,,803553,Go
0,Crime,Drama,Action,Thriller,80182853,Faster
0,Comedy,Drama,Romance,Music,35181074910402,Girl


In [14]:
#Merge output back with main df (merged_df_2)
final_df = pd.merge(merged_df_2, genre_df_out, on='title', how = 'inner')
#Drop Duplicates
final_df.drop_duplicates('title', keep = 'first', inplace = True)
#Drop initial genres column
final_df.drop(columns=['genres'], inplace = True)
#Rename title to book title
final_df = final_df.rename(columns = {'title': 'book title'})
#Set all columns to lowercase
final_df.columns = final_df.columns.str.lower()

#Reset Index
final_df = final_df.reset_index(drop=True)
final_df

Unnamed: 0,asin,group,format,book title,author,publisher,release_date,status,tagline,vote_average,year,movie title,genre1,genre2,genre3,genre4,id
0,1405246413,book,hardcover,Go,Diego,Go! Annual 2010,1999-04-09,Released,A weekend wasted is never a wasted weekend.,7.0,1999.0,Go,Crime,Comedy,Thriller,,803553
1,1498756301,book,hardcover,Faster,Better,Cheaper in the History of Manufacturing: From...,2010-11-23,Released,Slow Justice is no Justice,6.1,2003.0,Faster,Crime,Drama,Action,Thriller,80182853
2,1477117350,book,paperback,Girl,"You Ain't Gonna Make It""",: So They Said,1998-11-05,Released,A straight-A teen explores Seattle's rock scene.,4.7,1999.0,Girl,Comedy,Drama,Romance,Music,35181074910402
3,316199990,book,hardcover,Are We There Yet?,Dan Santat,"Little, Brown Books for Young Readers",2005-01-20,Released,24 hours. 350 miles. His girlfriend's kids. Wh...,5.2,2005.0,Are We There Yet?,Adventure,Comedy,Family,Romance,12351075110749
4,816680566,book,hardcover,Frozen,Mary Casanova,Univ Of Minnesota Press,2010-02-05,Released,No one knows you're up there,5.9,1997.0,Frozen,Thriller,,,,53
5,62194976,book,hardcover,Forbidden,Kimberley Griffiths Little,HarperCollins,1932-01-09,Released,Her Greatest Dramatic Role !,6.1,2000.0,Forbidden,Drama,,,,18
6,144240907X,book,hardcover,Fever,Lauren DeStefano,Simon & Schuster Children's Publishing,1999-05-16,Released,Who Can You Trust... When You No Longer Trust ...,6.0,1999.0,Fever,Drama,Thriller,,,1853
7,374378487,book,hardcover,Tuck Everlasting,Natalie Babbitt,"Farrar, Straus and Giroux",2002-10-11,Released,"If you could choose to live forever, would you?",6.4,2002.0,Tuck Everlasting,Drama,Family,Fantasy,Romance,18107511410749878
8,1250044669,book,hardcover,Renegades,Marissa Meyer,Feiwel & Friends,1989-06-02,Released,,5.2,1989.0,Renegades,Action,Crime,Thriller,,288053
9,1484722264,book,hardcover,Zero Day,Jan Gangsei,Disney-Hyperion,2003-09-03,Released,,7.1,2003.0,Zero Day,Drama,,,,18


In [15]:
#Write file to CSV
final_df.to_csv('master_data.csv')


####TEST CRAP####

In [16]:
#Set double column index by book and movie title
#final_df_2 = final_df.set_index(['book title','movie title'])
#final_df_2

In [17]:
#hello = genre_df.genres[0]
#b = hello.replace("'",'"')
#jdata2 = json.loads(b)

In [18]:
#jdata2

In [19]:
#Dummy DF
#d = {'id': [], 'name': [], 'title': []}
#d = ashley
#df_genre = pd.DataFrame(data=d)
#df_genre

In [20]:
#for item in genre_df.genres:
    #dataItem = json.loads(item.replace("'",'"'))
    #df_genre = pd.DataFrame(data=dataItem)
    #print (df_genre)

In [21]:
#def

#s = set()
#x = set()
#for value in genre_df.genres:
#    s = value
#    print (s)
#    tempVar = s.split("},")
#    print (tempVar)
#    for value2 in s:
#        x = value2
#        print(x)
   
    #print (x)
    #tempStr = value.replace("[","").replace("]","").replace("{","").replace("}","-").replace(" ","").replace("'","")
    #tempStr2 = tempStr.split("-")
    
    #print (tempStr2)

    
#for index in range(len(genre_df.genres)):
    #print (genre_df.genres[index])
    #print (genre_df.genres[index]["id"])
    #print (genre_df.title[index])
    
#hello = genre_df.genres[0]
#b = hello.replace("'",'"')
#jdata2 = json.loads(b)

In [22]:
#(merged_df_2.genres[0])

In [23]:
#(genre_df.title[0])

In [24]:
    #print (df_genre) 
    #print (genre_df.title[index])
    #d= df_genre
    #df_genre2 = pd.DataFrame(data=d)
    #print (df_genre)