In [6]:
import pandas as pd
import numpy as np
import json
from ast import literal_eval
import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single shared VADER analyzer instance; its polarity_scores() is what the
# scoring cell calls for each movie overview.
analyzer = SentimentIntensityAnalyzer()

In [7]:
# Load the TMDB 5000 movies file and clean it into the shape used downstream.

df1 = pd.read_csv("tmdb_5000_movies.csv")
# Store plain date objects instead of pandas Timestamps.
df1['release_date'] = pd.to_datetime(df1['release_date']).apply(lambda x: x.date())

# These columns are decoded from their JSON text form with json.loads.
json_columns = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']
for column in json_columns:
    df1[column] = df1[column].apply(json.loads)

# Keep only the columns the analysis uses and renumber the rows 0..n-1.
df1 = df1[["budget", "genres", "original_title", "popularity", "production_companies",
           "release_date", "revenue", "runtime", "vote_average", "vote_count", "overview"]]
df1 = df1.reset_index(drop=True)


def _names_only(entries):
    """Return the "name" value of every dict in ``entries``.

    A dict without a "name" key contributes None, and an empty input list
    yields an empty list.
    """
    return [entry.get("name") for entry in entries]


# Reduce each decoded list of dicts to a plain list of names. A single
# column-wise apply replaces the row-by-row iterrows loops, which relied on
# chained indexing (df1[col][index][i]) and ignored the row they iterated.
for list_column in ['genres', 'production_companies']:
    df1[list_column] = df1[list_column].apply(_names_only)

len(df1)

4803

In [8]:
# Load the 2017 movie file and bring it into the same layout as df1.
df2 = pd.read_csv("2017_movie_info.csv")

# Store plain date objects instead of pandas Timestamps.
df2['release_date'] = pd.to_datetime(df2['release_date']).apply(lambda stamp: stamp.date())

# Both list-valued columns are parsed from their text form with literal_eval.
for list_column in ("genres", "production_companies"):
    df2[list_column] = df2[list_column].apply(literal_eval)

# Drop the id column and renumber the rows 0..n-1.
df2 = df2.drop(columns='id').reset_index(drop=True)
len(df2)


341

In [9]:
# Stack both sources row-wise (union of columns) under a fresh 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)

# Keep only movies with more than 100 votes; no filtering on budget, revenue
# or production companies is applied.
# reset_index() without drop=True keeps each row's pre-filter label in an
# "index" column.
clean_df = df[df["vote_count"].gt(100)].reset_index()

len(clean_df)

3490

In [10]:
#runs vader
def _compound_or_none(text):
    """Return VADER's compound score for ``text``, or None if it cannot be scored.

    Non-string values (presumably missing overviews, which read_csv loads as
    NaN) are skipped explicitly rather than relying on the analyzer to raise.
    Any other analyzer failure is still treated as "no score" so one bad
    overview does not abort the run, but only ``Exception`` is caught:
    KeyboardInterrupt and SystemExit propagate, unlike with a bare ``except:``.
    """
    if not isinstance(text, str):
        return None
    try:
        return analyzer.polarity_scores(text)["compound"]
    except Exception:
        return None


# One score (or None) per row, in row order.
compound_score = [_compound_or_none(text) for text in clean_df["overview"]]

clean_df["compound_score"] = compound_score
clean_df.to_csv("movies_with_compound.csv", index=False, header=True)
clean_df.head(2)

Unnamed: 0,index,budget,genres,original_title,popularity,production_companies,release_date,revenue,runtime,vote_average,vote_count,overview,compound_score
0,0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,150.437577,"[Ingenious Film Partners, Twentieth Century Fo...",2009-12-10,2787965087,162.0,7.2,11800,"In the 22nd century, a paraplegic Marine is di...",-0.3612
1,1,300000000,"[Adventure, Fantasy, Action]",Pirates of the Caribbean: At World's End,139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",2007-05-19,961000000,169.0,6.9,4500,"Captain Barbossa, long believed to be dead, ha...",-0.3919


In [11]:
# Collect every distinct genre name, in the order each is first seen.

unique_genres = []

for movie_genres in clean_df["genres"]:
    for genre_name in movie_genres:
        if genre_name in unique_genres:
            continue
        unique_genres.append(genre_name)
unique_genres

['Action',
 'Adventure',
 'Fantasy',
 'Science Fiction',
 'Crime',
 'Drama',
 'Thriller',
 'Animation',
 'Family',
 'Western',
 'Comedy',
 'Romance',
 'Horror',
 'Mystery',
 'History',
 'War',
 'Music',
 'Documentary',
 'TV Movie']

In [12]:
# Bucket the movie rows by genre. A movie that lists several genres is added
# to each of those buckets.
genre_dict = {
    genre: []
    for genre in [
        "Action",
        'Adventure',
        'Fantasy',
        'Science Fiction',
        'Crime',
        'Drama',
        'Thriller',
        'Animation',
        'Family',
        'Western',
        'Comedy',
        'Romance',
        'Horror',
        'Mystery',
        'History',
        'War',
        'Music',
        'Documentary',
        'TV Movie',
    ]
}

for index, row in clean_df.iterrows():
    for genre in row["genres"]:
        # Append straight to the genre's bucket instead of scanning a separate
        # list of names for a match. setdefault also covers a genre that is
        # not in the list above, where a plain lookup would raise KeyError.
        genre_dict.setdefault(genre, []).append(row)

# One DataFrame per genre, built from the collected row Series.
Action_df = pd.DataFrame(genre_dict["Action"])
Adventure_df = pd.DataFrame(genre_dict['Adventure'])
# A trailing comma here used to turn Fantasy_df into a 1-tuple wrapping the
# frame; it is now a plain DataFrame like its siblings.
Fantasy_df = pd.DataFrame(genre_dict['Fantasy'])
Science_Fiction_df = pd.DataFrame(genre_dict['Science Fiction'])
Crime_df = pd.DataFrame(genre_dict['Crime'])
Drama_df = pd.DataFrame(genre_dict['Drama'])
Thriller_df = pd.DataFrame(genre_dict['Thriller'])
Animation_df = pd.DataFrame(genre_dict['Animation'])
Family_df = pd.DataFrame(genre_dict['Family'])
Western_df = pd.DataFrame(genre_dict['Western'])
Comedy_df = pd.DataFrame(genre_dict['Comedy'])
Romance_df = pd.DataFrame(genre_dict['Romance'])
Horror_df = pd.DataFrame(genre_dict['Horror'])
Mystery_df = pd.DataFrame(genre_dict['Mystery'])
History_df = pd.DataFrame(genre_dict['History'])
War_df = pd.DataFrame(genre_dict['War'])
Music_df = pd.DataFrame(genre_dict['Music'])
Documentary_df = pd.DataFrame(genre_dict['Documentary'])
TV_Movie_df = pd.DataFrame(genre_dict['TV Movie'])

In [15]:
# Split each release date into separate year and month columns.
year = [released.year for released in clean_df["release_date"]]
month = [released.month for released in clean_df["release_date"]]
clean_df['released_year'] = year
clean_df['released_month'] = month

clean_df.head(2)

Unnamed: 0,index,budget,genres,original_title,popularity,production_companies,release_date,revenue,runtime,vote_average,vote_count,overview,compound_score,released_year,released_month
0,0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,150.437577,"[Ingenious Film Partners, Twentieth Century Fo...",2009-12-10,2787965087,162.0,7.2,11800,"In the 22nd century, a paraplegic Marine is di...",-0.3612,2009,12
1,1,300000000,"[Adventure, Fantasy, Action]",Pirates of the Caribbean: At World's End,139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",2007-05-19,961000000,169.0,6.9,4500,"Captain Barbossa, long believed to be dead, ha...",-0.3919,2007,5


In [16]:
# Label every movie with its release decade.
# Bin edges sit on decade starts and intervals are closed on the left
# (right=False): [1920, 1930) -> "1920s", ..., [2010, 2020) -> "2010s".
# With pd.cut's default right-closed intervals, a year ending in 0 (e.g. 2010)
# fell into the previous decade's bin, and the lowest edge itself was excluded.
year_bins = [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]
group_names = ["1920s", "1930s", "1940s", "1950s", "1960s", "1970s", "1980s", "1990s", "2000s", "2010s"]
clean_df["decade"] = pd.cut(clean_df["released_year"], year_bins, labels=group_names, right=False)

In [18]:
# Preview the first two rows of the cleaned frame.
clean_df.head(2)

Unnamed: 0,index,budget,genres,original_title,popularity,production_companies,release_date,revenue,runtime,vote_average,vote_count,overview,compound_score,released_year,released_month,decade
0,0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",Avatar,150.437577,"[Ingenious Film Partners, Twentieth Century Fo...",2009-12-10,2787965087,162.0,7.2,11800,"In the 22nd century, a paraplegic Marine is di...",-0.3612,2009,12,2000s
1,1,300000000,"[Adventure, Fantasy, Action]",Pirates of the Caribbean: At World's End,139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",2007-05-19,961000000,169.0,6.9,4500,"Captain Barbossa, long believed to be dead, ha...",-0.3919,2007,5,2000s
