In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import csv
from sqlalchemy import create_engine
import cpi



In [2]:
# Load the raw movie metadata CSV into a DataFrame, then list its columns.
movie_load = 'movie_data.csv'
movie_df = pd.read_csv(
    filepath_or_buffer=movie_load,
    encoding='utf8',
)
movie_df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [3]:
# Give the columns used later in the notebook presentation-friendly names.
# Columns not listed here keep their original names.
movie_df = movie_df.rename(
    columns={
        'director_name': 'Director',
        'gross': 'Gross_Income',
        'movie_title': 'Movie Title',
        'content_rating': 'Content Rating',
        'budget': 'Budget',
        'title_year': 'Year',
        'imdb_score': 'IMDB Score',
    }
)

In [4]:
# Drop incomplete rows, renumber the index, and cast the float columns to integers.
# NOTE: dropna() without `subset` removes a row when ANY column is missing,
# not only the three columns cast below.
# 'int64' is spelled out because plain `int` follows the platform C long under
# NumPy < 2 (32-bit on Windows), which cannot hold amounts above ~2.1 billion.
format_mapping = {'Gross_Income', 'Budget', 'Year'}
movie_df = (
    movie_df
    .dropna()
    .reset_index(drop=True)
    .astype({key: 'int64' for key in format_mapping})
)

In [5]:
# Add inflation-adjusted copies of the gross and budget columns.
# cpi.inflate is called with (amount, source year) and no explicit target,
# so the target period is whatever the cpi library defaults to.
def _inflate_row(row, amount_column):
    """Return cpi.inflate applied to row[amount_column] using the row's 'Year'."""
    return cpi.inflate(row[amount_column], row['Year'])


movie_df['Gross_Income_ADJ'] = movie_df.apply(_inflate_row, axis=1, args=('Gross_Income',))
movie_df['Budget_ADJ'] = movie_df.apply(_inflate_row, axis=1, args=('Budget',))

In [6]:
# Keep only the columns needed for the analysis, in presentation order,
# and preview the first rows.
analysis_columns = [
    'Movie Title',
    'Year',
    'Content Rating',
    'Budget_ADJ',
    'Gross_Income_ADJ',
    'Director',
    'IMDB Score',
]
movie_df = movie_df[analysis_columns]
movie_df.head()

Unnamed: 0,Movie Title,Year,Content Rating,Budget_ADJ,Gross_Income_ADJ,Director,IMDB Score
0,Avatar,2009,PG-13,270785200.0,868918600.0,James Cameron,7.9
1,Pirates of the Caribbean: At World's End,2007,PG-13,354660400.0,365778000.0,Gore Verbinski,7.1
2,Spectre,2015,PG-13,253375900.0,206914200.0,Sam Mendes,6.8
3,The Dark Knight Rises,2012,PG-13,266905900.0,478434900.0,Christopher Nolan,8.5
4,John Carter,2012,PG-13,281532400.0,77999180.0,Andrew Stanton,6.6


In [7]:
# Save the numeric (unformatted) data for analysis, then show the column list.
# to_csv is called with its defaults, so the row index is written as well.
movie_df.to_csv(path_or_buf="Analysis.csv")
movie_df.columns

Index(['Movie Title', 'Year', 'Content Rating', 'Budget_ADJ',
       'Gross_Income_ADJ', 'Director', 'IMDB Score'],
      dtype='object')

In [8]:
# Render the adjusted dollar columns as "$1,234.56" strings and Year as a
# plain integer string.
# Only columns that are still numeric are formatted: once this cell has run,
# movie_df holds strings, and formatting them again would raise
# "ValueError: Unknown format code 'f' for object of type 'str'".
# The guard makes re-running the cell a no-op instead.
format_mapping = {'Gross_Income_ADJ': '${:,.2f}', 'Budget_ADJ': '${:,.2f}', 'Year': '{:.0f}'}
movie_df = movie_df.assign(**{
    key: movie_df[key].map(value.format)
    for key, value in format_mapping.items()
    if pd.api.types.is_numeric_dtype(movie_df[key])
})

In [9]:
# Save the display-formatted data; to_csv defaults apply, so the index is written too.
movie_df.to_csv(path_or_buf="Clean_Data.csv")