# Análise de Filmes da Disney - Baseado em lista do IMDB

### Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import re
import json
import requests
import seaborn as sns
%matplotlib inline
import ast
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import random


### Questões iniciais

O que faz um filme ter sucesso?
O que influencia na rentabilidade?
Qual a correlação entre diretores, personagens, investimento?
Análise de Sentimento dos scripts dos filmes mais famosos
Bag of Words
Gerador de frases
Sistema de recomendação - Quiz Buzzfeed - Relacionar com os parques

### Webscrapping

In [2]:
url = 'https://www.imdb.com/list/ls068561553/'

In [3]:
get_html1 = requests.get(url)

In [4]:
get_html1.status_code

200

In [5]:
html1 = get_html1.content

In [6]:
soup1 = BeautifulSoup(html1, "lxml")

In [7]:
lista1 = [i.get("href") for i in soup1.find_all('a') if str(i.get("href")).startswith("/title/")]


In [8]:
lista_links = lista1


In [9]:
for link2 in range(1, 6):
    url2 = 'https://www.imdb.com/list/ls068561553/' + '?sort=list_order,asc&st_dt=&mode=detail&page=' + str(link2)
    get_html2 = requests.get(url2).content
    soup2 = BeautifulSoup(get_html2, "lxml")
    lista2 = [i.get("href") for i in soup2.find_all('a') if str(i.get("href")).startswith("/title/")]
    lista_links+=lista2
len(lista_links)

1314

In [10]:
len(set(lista_links))

567

In [11]:
lista_links[:5]

['/title/tt0108052/?ref_=nv_mv_dflt_1',
 '/title/tt0108052/?ref_=nv_mv_dflt_2',
 '/title/tt0029583/',
 '/title/tt0029583/',
 '/title/tt0032910/']

In [12]:
#Criação do dataset principal

disney = pd.DataFrame()

In [14]:
for item in lista_links[2:]:
    link = "http://www.imdb.com" + item
    get_html = requests.get(link).content
    soup = BeautifulSoup(get_html, "lxml")
    
    #title = [item.get('content').split(' (')[0] for item in soup.find_all('meta') if type(item.get('content')) != None
                                                                #and str(item.get('content')).endswith('- IMDb')]
    title = None
    rating_imdb = None
    directors = None
    w = None
    s = None
    keywords = None
    g = None
    c = None
    rd = None
    budget1 = None
    worldwide_gross1 = None
    production = None
    runtime = None   
    
        
    try:
        title = [element.get('content').split(' (')[0] for element in soup.find_all("meta",  property="og:title")][0]
        #print(title, item)
    except:
        title = None
    
    try:
        rating_imdb = [i for i in soup.find_all('span', {'itemprop': 'ratingValue'})][0].text
    except:
        rating_imdb = None
        
    try:
        directors = [element.text.strip() for element in soup.find_all('div',"credit_summary_item") 
                 if "Directors:" in element.text][0].split("\n")[1].strip("|").split(",")
    except:
        directors = None
        
    
    
    try:
        writers = [element.text.replace("\n"," ") for element in soup.find_all('div',"credit_summary_item") 
                   if "Writers:" in element.text][0]
        w = re.findall('[A-Z]\w*\s[A-Z]\w*',writers)
    except:
        w = None
    
    
    try:
        stars = [element.text.replace("\n"," ") for element in soup.find_all('div',"credit_summary_item") 
                 if "Stars:" in element.text]
        s = re.findall('[A-Z]\w*\s[A-Z]\w*',stars[0])
    except:
        stars = None
    
    
    try:
        keywords = [i.text for i in soup.find_all('span', {'class': 'itemprop'})]
    except:
        keywords = None
    
    
    try:
        genres = [element.text.replace("\n"," ").replace("\xa0|"," ") for element in soup.find_all('div',
                                                "see-more inline canwrap") if "Genres:" in element.text]
        g = re.findall('[A-Z]\w*\s',genres[0])
        
    except:
        g = None
    
    try:
        certificate = [element.text for element in soup.find_all('div',"txt-block") if "Certificate:" in element.text]
        c = re.findall(r"\n(\w+)\n",certificate[0])[0]
    except:
        c = None
      
    
    try:
        release_date = [element.text for element in soup.find_all('div',"txt-block") if "Release Date:" in element.text]
        rd = re.findall(r"\nRelease\sDate: (\d*\s\w*\s\d*)",release_date[0])[0]
    except:
        rd = None
    
    
    try:
        budget = [element.text for element in soup.find_all('div',"txt-block") if "Budget:" in element.text]
        budget1 = re.findall('\d{1,}(?:,\d{3})+', budget[0])[0]   
    except:
        budget1 = 0
    
    
    try:
        worldwide_gross = [element.text for element in soup.find_all('div',"txt-block") 
                       if "Cumulative Worldwide Gross:" in element.text]
        worldwide_gross1 = re.findall('\d{1,}(?:,\d{3})+', worldwide_gross[0])[0]
        
    except:
        try:
            worldwide_gross1 = [element.text for element in soup.find_all('div',"txt-block")
                               if "Gross USA:" in element.text]
            worldwide_gross1 = re.findall('\d{1,}(?:,\d{3})+', worldwide_gross1[0])[0]
            
        except:
            worldwide_gross1 = 0
    
    
    try:
        production = re.findall(r'\n (.+) \n', [element.text for element in soup.find_all('div',"txt-block") 
                                            if "Production Co:" in element.text][0])[0]
    except:
        production = None
        
    try:
        runtime = [i.text for i in soup.find_all('time')][1].strip(" min")
    except:
        runtime = 0
    
    
    df_disney = pd.DataFrame({"Movie_Title":[title],"IMDB_Rating": [rating_imdb], "Directors": [directors], 
                   "Writers" : [w],"Stars":[s], "Keywords" : [keywords],
                   "Genres":[g],"Certificate":[c], "Release_Date":[rd],
                  "Budget":[budget1],"Worldwide_Gross":[worldwide_gross1], "Production":[production],"Runtime":[runtime]})
    
    disney = disney.append(df_disney)

In [14]:
len(list(lista1[2:]))

1312

In [15]:
disney.dtypes

Series([], dtype: object)

In [16]:
disney.head()

In [17]:
#disney.to_csv("DISNEYCORRIGIDO.csv")

In [101]:
#Segundo dataset

imdb = pd.read_csv("IMDBCORRIGIDO.csv", sep=',', encoding = 'unicode_escape')

In [102]:
imdb.head()

Unnamed: 0.1,Unnamed: 0,Movie_Title,Title_Type,Year,Genres,Num Votes,Directors
0,0,Snow White and the Seven Dwarfs,movie,1937.0,"Animation, Family, Fantasy, Musical, Romance",168528.0,"William Cottrell, David Hand, Ben Sharpsteen, ..."
1,1,Pinocchio,movie,1940.0,"Animation, Comedy, Family, Fantasy, Musical",119919.0,"Ben Sharpsteen, Jack Kinney, Hamilton Luske, W..."
2,2,Fantasia,movie,1940.0,"Animation, Family, Fantasy, Music",83162.0,"Samuel Armstrong, Ben Sharpsteen, Hamilton Lus..."
3,3,The Reluctant Dragon,movie,1941.0,"Animation, Comedy, Family",2203.0,"Jack Kinney, Alfred L. Werker, Hamilton Luske,..."
4,4,Dumbo,movie,1941.0,"Animation, Drama, Family, Musical",111062.0,"Samuel Armstrong, Ben Sharpsteen, Jack Kinney,..."


In [103]:
disney = pd.read_csv("C:/Users/marim/Documents/IRONHACK/PROJETO FINAL/DISNEYCORRIGIDO.csv")

In [104]:
#Dataset criado a partir do webscrapping e do dataset importado direto na página do IMBD, com o intuito de remover links que 
#não fizessem parte da lista

data_disney = pd.merge(imdb, disney, left_on ='Movie_Title', right_on='Movie_Title', how = 'inner')

In [105]:
data_disney.head()

Unnamed: 0,Unnamed: 0_x,Movie_Title,Title_Type,Year,Genres_x,Num Votes,Directors_x,Unnamed: 0_y,IMDB_Rating,Directors_y,Writers,Stars,Keywords,Genres_y,Certificate,Release_Date,Budget,Worldwide_Gross,Production,Runtime
0,0,Snow White and the Seven Dwarfs,movie,1937.0,"Animation, Family, Fantasy, Musical, Romance",168528.0,"William Cottrell, David Hand, Ben Sharpsteen, ...",0,7.6,"['William Cottrell', ' David Hand ']","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...","['Animation ', 'Family ', 'Fantasy ', 'Musical...",Livre,4 February 1938,1499000,184925486,Walt Disney Productions,83
1,0,Snow White and the Seven Dwarfs,movie,1937.0,"Animation, Family, Fantasy, Musical, Romance",168528.0,"William Cottrell, David Hand, Ben Sharpsteen, ...",0,7.6,"['William Cottrell', ' David Hand ']","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...","['Animation ', 'Family ', 'Fantasy ', 'Musical...",Livre,4 February 1938,1499000,184925486,Walt Disney Productions,83
2,0,Snow White and the Seven Dwarfs,movie,1937.0,"Animation, Family, Fantasy, Musical, Romance",168528.0,"William Cottrell, David Hand, Ben Sharpsteen, ...",0,7.6,"['William Cottrell', ' David Hand ']","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...","['Animation ', 'Family ', 'Fantasy ', 'Musical...",Livre,4 February 1938,1499000,184925486,Walt Disney Productions,83
3,0,Snow White and the Seven Dwarfs,movie,1937.0,"Animation, Family, Fantasy, Musical, Romance",168528.0,"William Cottrell, David Hand, Ben Sharpsteen, ...",0,7.6,"['William Cottrell', ' David Hand ']","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...","['Animation ', 'Family ', 'Fantasy ', 'Musical...",Livre,4 February 1938,1499000,184925486,Walt Disney Productions,83
4,0,Snow White and the Seven Dwarfs,movie,1937.0,"Animation, Family, Fantasy, Musical, Romance",168528.0,"William Cottrell, David Hand, Ben Sharpsteen, ...",0,7.6,"['William Cottrell', ' David Hand ']","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...","['Animation ', 'Family ', 'Fantasy ', 'Musical...",Livre,4 February 1938,1499000,184925486,Walt Disney Productions,83


In [106]:
data_disney.columns

Index(['Unnamed: 0_x', 'Movie_Title', 'Title_Type', 'Year', 'Genres_x',
       'Num Votes', 'Directors_x', 'Unnamed: 0_y', 'IMDB_Rating',
       'Directors_y', 'Writers', 'Stars', 'Keywords', 'Genres_y',
       'Certificate', 'Release_Date', 'Budget', 'Worldwide_Gross',
       'Production', 'Runtime'],
      dtype='object')

In [107]:
#Removendo colunas duplicadas e desnecessárias

data_disney = data_disney.drop(columns = ['Unnamed: 0_x', 'Unnamed: 0_y', 'Genres_y', 'Directors_y'])

In [108]:
data_disney.columns

Index(['Movie_Title', 'Title_Type', 'Year', 'Genres_x', 'Num Votes',
       'Directors_x', 'IMDB_Rating', 'Writers', 'Stars', 'Keywords',
       'Certificate', 'Release_Date', 'Budget', 'Worldwide_Gross',
       'Production', 'Runtime'],
      dtype='object')

In [109]:
data_disney.columns = ['Movie_Title', 'Title_Type', 'Year', 'Genres', 'Num Votes', 'Directors',
               'IMDB_Rating', 'Writers', 'Stars', 'Keywords', 'Certificate',
               'Release_Date', 'Budget', 'Worldwide_Gross', 'Production', 'Runtime']

In [110]:
column_order = ['Movie_Title', 'Release_Date', 'Year', 'Certificate', 'Genres', 'Budget', 'Worldwide_Gross', 'Production',
               'Directors', 'Writers', 'Stars', 'Keywords', 'Runtime', 'IMDB_Rating', 'Num Votes' , 'Title_Type']

data_disney = data_disney[column_order]


In [111]:
data_disney.head()

Unnamed: 0,Movie_Title,Release_Date,Year,Certificate,Genres,Budget,Worldwide_Gross,Production,Directors,Writers,Stars,Keywords,Runtime,IMDB_Rating,Num Votes,Title_Type
0,Snow White and the Seven Dwarfs,4 February 1938,1937.0,Livre,"Animation, Family, Fantasy, Musical, Romance",1499000,184925486,Walt Disney Productions,"William Cottrell, David Hand, Ben Sharpsteen, ...","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...",83,7.6,168528.0,movie
1,Snow White and the Seven Dwarfs,4 February 1938,1937.0,Livre,"Animation, Family, Fantasy, Musical, Romance",1499000,184925486,Walt Disney Productions,"William Cottrell, David Hand, Ben Sharpsteen, ...","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...",83,7.6,168528.0,movie
2,Snow White and the Seven Dwarfs,4 February 1938,1937.0,Livre,"Animation, Family, Fantasy, Musical, Romance",1499000,184925486,Walt Disney Productions,"William Cottrell, David Hand, Ben Sharpsteen, ...","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...",83,7.6,168528.0,movie
3,Snow White and the Seven Dwarfs,4 February 1938,1937.0,Livre,"Animation, Family, Fantasy, Musical, Romance",1499000,184925486,Walt Disney Productions,"William Cottrell, David Hand, Ben Sharpsteen, ...","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...",83,7.6,168528.0,movie
4,Snow White and the Seven Dwarfs,4 February 1938,1937.0,Livre,"Animation, Family, Fantasy, Musical, Romance",1499000,184925486,Walt Disney Productions,"William Cottrell, David Hand, Ben Sharpsteen, ...","['Jacob Grimm', 'Wilhelm Grimm']","['Adriana Caselotti', 'Harry Stockwell', 'Luci...","['dwarf', 'snow white and the seven dwarfs', '...",83,7.6,168528.0,movie


In [112]:
data_disney.shape

(2008, 16)

In [113]:
data_disney = data_disney.sort_values("Movie_Title", ascending = True)

In [114]:
data_disney.head()

Unnamed: 0,Movie_Title,Release_Date,Year,Certificate,Genres,Budget,Worldwide_Gross,Production,Directors,Writers,Stars,Keywords,Runtime,IMDB_Rating,Num Votes,Title_Type
1636,'Twas the Night,7 December 2001,2001.0,,"Comedy, Family",0,0,"'Twas Productions, Adam Productions, Disney Ch...",Nick Castle,"['Jim Lincoln', 'Dan Studney']","['Josh Zuckerman', 'Brenda Grate', 'Bryan Cran...","['joyride', 'santa', 'uncle', 'boy', '14 year ...",0,5.5,692.0,tvMovie
1637,'Twas the Night,7 December 2001,2001.0,,"Comedy, Family",0,0,"'Twas Productions, Adam Productions, Disney Ch...",Nick Castle,"['Jim Lincoln', 'Dan Studney']","['Josh Zuckerman', 'Brenda Grate', 'Bryan Cran...","['joyride', 'santa', 'uncle', 'boy', '14 year ...",0,5.5,692.0,tvMovie
1481,101 Dalmatians,14 December 1996,1996.0,Livre,"Adventure, Comedy, Crime, Family",75000000,320689294,"Walt Disney Pictures, Wizzer Productions, Grea...",Stephen Herek,"['Dodie Smith', 'John Hughes']","['Glenn Close', 'Jeff Daniels', 'Joely Richard...","['dalmatian', 'live action remake', 'dog actor...",103,5.7,94719.0,movie
1483,101 Dalmatians,14 December 1996,1996.0,Livre,"Adventure, Comedy, Crime, Family",75000000,320689294,"Walt Disney Pictures, Wizzer Productions, Grea...",Stephen Herek,"['Dodie Smith', 'John Hughes']","['Glenn Close', 'Jeff Daniels', 'Joely Richard...","['dalmatian', 'live action remake', 'dog actor...",103,5.7,94719.0,movie
1482,101 Dalmatians,14 December 1996,1996.0,Livre,"Adventure, Comedy, Crime, Family",75000000,320689294,"Walt Disney Pictures, Wizzer Productions, Grea...",Stephen Herek,"['Dodie Smith', 'John Hughes']","['Glenn Close', 'Jeff Daniels', 'Joely Richard...","['dalmatian', 'live action remake', 'dog actor...",103,5.7,94719.0,movie


In [115]:
#Removendo valores duplicados

filtro3 = ['Movie_Title', 'Release_Date']



data_disney = data_disney.drop_duplicates(filtro3)

In [116]:
#Arrumando o formato de data da coluna

from datetime import datetime

date_str1 = '4 February 1938'

def date(x):
    try:
        date1 = datetime.strptime(x, '%d %B %Y').date()
        return date1
    except:
        return 0


data_disney['Release_Date'] = data_disney['Release_Date'].apply(date)

In [117]:
#trocar o Livre em Certificate por "G"
#arrumar Writers, Stars, Keywords
#tirar as virgulas de Budget e Gross
#transformar os tipos de colunas
#transformar ano em inteiro

In [118]:
data_disney['Budget'] = data_disney['Budget'].str.replace(',', '')

In [119]:
data_disney['Worldwide_Gross'] = data_disney['Worldwide_Gross'].str.replace(',', '')

In [120]:
data_disney['Certificate'] = data_disney['Certificate'].str.replace('Livre', 'G')

In [121]:
data_disney['Budget'] = data_disney['Budget'].fillna(0)

In [122]:
data_disney['Budget'] = data_disney['Budget'].astype('int')

In [123]:
data_disney['Worldwide_Gross'] = data_disney['Worldwide_Gross'].fillna(None)

ValueError: Must specify a fill 'value' or 'method'.

In [124]:
data_disney['Worldwide_Gross'] = data_disney['Worldwide_Gross'].astype('int')


In [125]:
data_disney['Runtime'] = pd.to_numeric(data_disney['Runtime'])


In [126]:
data_disney['IMDB_Rating'] = pd.to_numeric(data_disney['IMDB_Rating'])


In [127]:
data_disney['Year'] = data_disney['Year'].astype('int')

In [128]:
data_disney['Directors'] = data_disney['Directors'].str.split(',')

In [129]:
data_disney["Directors"].fillna("No Information",inplace=True)

In [130]:
data_disney["Writers"].fillna("No Information",inplace=True)

In [131]:
data_disney["Stars"].fillna("No Information",inplace=True)

In [132]:
data_disney['Genres'] = data_disney['Genres'].str.split(',')

In [133]:
data_disney['Production'] = data_disney['Production'].str.split(',')

In [134]:
#Tratando os valores nulos

for column in data_disney.columns:
    data_disney[column] = data_disney[column].apply(lambda x: None if x == 0 else x)

In [135]:
data_disney.isnull().sum()

Movie_Title          0
Release_Date       187
Year                 0
Certificate        289
Genres               0
Budget             366
Worldwide_Gross    318
Production          54
Directors            0
Writers              0
Stars                0
Keywords             0
Runtime            126
IMDB_Rating         68
Num Votes            0
Title_Type           0
dtype: int64

In [136]:
#Verificando os valores nulos da coluna que será utilizada para medir rentabilidade

data_disney[data_disney["Worldwide_Gross"].isnull()]

Unnamed: 0,Movie_Title,Release_Date,Year,Certificate,Genres,Budget,Worldwide_Gross,Production,Directors,Writers,Stars,Keywords,Runtime,IMDB_Rating,Num Votes,Title_Type
1636,'Twas the Night,2001-12-07,2001,,"[Comedy, Family]",,,"['Twas Productions, Adam Productions, Disney...",[Nick Castle],"['Jim Lincoln', 'Dan Studney']","['Josh Zuckerman', 'Brenda Grate', 'Bryan Cran...","['joyride', 'santa', 'uncle', 'boy', '14 year ...",,5.5,692.0,tvMovie
193,"20,000 Leagues Under the Sea",1956-01-01,1954,,"[Adventure, Drama, Family, Fantasy, Sci-Fi]",9000000.0,,[Walt Disney Productions],[Richard Fleischer],No Information,"['Kirk Douglas', 'James Mason', 'Paul Lukas']","['submarine', 'giant tentacle', 'submarine cap...",127.0,7.2,26583.0,movie
1660,A Ring of Endless Light,2002-08-23,2002,,"[Drama, Family, Romance]",,,[Disney Channel],[Greg Beeman],"['Madeleine L', 'Marita Giovanni']","['Mischa Barton', 'Ryan Merriman', 'Jared Pada...","['leukemia', 'dying', 'death', 'dolphin', 'yac...",88.0,6.1,1000.0,tvMovie
1661,A Ring of Endless Light,,2002,,"[Drama, Family, Romance]",,,,[Greg Beeman],No Information,No Information,[],,,1000.0,tvMovie
1126,A Tale of Two Critters,1977-06-22,1977,,"[Adventure, Family]",,,[Walt Disney Productions],[Jack Speirs],No Information,No Information,[],48.0,7.0,64.0,movie
563,A Tiger Walks,,1964,,"[Family, Drama]",,,[Walt Disney Productions],[Norman Tokar],"['Ian Niall', 'Lowell S']","['Brian Keith', 'Vera Miles', 'Pamela Franklin']","['tiger', 'circus', 'escaped animal', 'animal ...",91.0,6.5,246.0,movie
560,A Tiger Walks,,1964,0,"[Family, Drama]",,,[Walt Disney Productions],[Norman Tokar],"['Ian Niall', 'Lowell S']","['Brian Keith', 'Vera Miles', 'Pamela Franklin']","['tiger', 'circus', 'escaped animal', 'animal ...",91.0,6.5,246.0,movie
1673,Air Bud: Seventh Inning Fetch,2002-06-18,2002,G,"[Comedy, Drama, Family, Sport]",,,"[Fetch Boy Films Ltd., International Keystone...",[Robert Vince],"['Kevin DiCicco', 'Air Bud', 'Robert Vince']","['Jeffrey Ballard', 'Jay Brazeau', 'Jason Bryd...","['2000s', 'baseball', 'animal character name i...",93.0,4.5,1843.0,video
1706,Air Bud: Spikes Back,2003-07-01,2003,G,"[Comedy, Family, Sport]",5000000.0,,"[Keystone Entertainment, Keystone Pictures, ...",[Mike Southon],"['Kevin DiCicco', 'Air Bud', 'Robert Vince']","['Darren J', 'Christopher Bishop', 'Tyler Bois...","['2000s', 'golden retriever', 'dog movie', 'vo...",,4.1,1637.0,video
1818,Air Buddies,2006-12-12,2006,,"[Adventure, Comedy, Family]",9000000.0,,"[Keystone Family Pictures, Key Pix Productions]",[Robert Vince],"['Robert Vince', 'Anna McRoberts']","['Cascy Beddow', 'Jane Carr', 'Kelly Chapek']","['skunk', 'wolf', 'goat', 'pig', '2000s']",80.0,4.8,3061.0,video


In [137]:
#Coluna com muitos valores nulos e baixa relevância

data_disney = data_disney.drop(columns = ['Certificate'])

In [138]:
data_disney.dtypes

Movie_Title         object
Release_Date        object
Year                 int64
Genres              object
Budget             float64
Worldwide_Gross    float64
Production          object
Directors           object
Writers             object
Stars               object
Keywords            object
Runtime            float64
IMDB_Rating        float64
Num Votes          float64
Title_Type          object
dtype: object

In [139]:
data_disney.to_csv("SUCESSO.csv")

In [140]:
#Removendo os valores nulos da coluna númerica principal para analisar sucesso do filme

data_disney = data_disney.loc[~data_disney['IMDB_Rating'].isna()]

In [141]:
# Criando Dummies para a coluna de diretores para montar um novo dataframe de análise

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

data_diretores = pd.DataFrame(mlb.fit_transform(data_disney['Directors']),columns=mlb.classes_, index=data_disney.index)


In [142]:
data_diretores.drop(["N", "I", "n","i","f","o","r","m","a","t"],axis=1,inplace=True)

In [143]:
data_diretores = data_diretores.join(data_disney["IMDB_Rating"])

In [144]:
data_diretores.head()

Unnamed: 0,Unnamed: 1,Alex Mann,Alfred L. Werker,Arthur Davis,Ash Brannon,Barry Cook,Ben Sharpsteen,Bill Roberts,Bill Speers,Bob Peterson,...,Walter Murch,Wilfred Jackson,Will Finn,William Beaudine,William Cottrell,William Dear,Winston Hibler,Wolfgang Reitherman,Xavier Koller,IMDB_Rating
1636,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.5
1481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.7
1614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.9
193,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.2
1542,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.2


In [145]:
#data_diretores.to_csv("DIRETORES.csv")

In [146]:
mlb = MultiLabelBinarizer()

data_escritores = pd.DataFrame(mlb.fit_transform(data_disney['Writers']),columns=mlb.classes_, index=data_disney.index)

In [147]:
data_escritores = data_escritores.join(data_disney["IMDB_Rating"])

In [148]:
data_escritores.drop(["N", "I", "n","i","f","o","r","m","a","t"],axis=1,inplace=True)

In [149]:
data_escritores.head()

Unnamed: 0,Unnamed: 1,',",",0,A,B,C,D,E,F,...,v,w,x,y,z,ä,é,ô,ü,IMDB_Rating
1636,1,1,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,5.5
1481,1,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,5.7
1614,1,1,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,4.9
193,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7.2
1542,1,1,1,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,7.2


In [150]:
#data_escritores.to_csv("ESCRITORES.csv")

In [151]:
mlb = MultiLabelBinarizer()

data_estrelas = pd.DataFrame(mlb.fit_transform(data_disney['Stars']),columns=mlb.classes_, index=data_disney.index)

In [152]:
data_estrelas = data_estrelas.join(data_disney["IMDB_Rating"])

In [153]:
data_estrelas.drop(["N", "I", "n","i","f","o","r","m","a","t"],axis=1,inplace=True)

In [154]:
data_estrelas.head()

Unnamed: 0,Unnamed: 1,',",",A,B,C,D,E,F,G,...,x,y,z,á,è,é,ë,í,ü,IMDB_Rating
1636,1,1,1,0,1,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,5.5
1481,1,1,1,0,0,1,1,0,0,1,...,0,1,0,0,0,0,0,0,0,5.7
1614,1,1,1,0,0,1,1,0,0,1,...,0,0,0,0,0,1,0,0,0,4.9
193,1,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,7.2
1542,1,1,1,0,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,7.2


In [67]:
#data_estrelas.to_csv("ESTRELAS.csv")

In [68]:
mlb = MultiLabelBinarizer()

data_keywords = pd.DataFrame(mlb.fit_transform(data_disney['Keywords']),columns=mlb.classes_, index=data_disney.index)

In [69]:
data_keywords = data_keywords.join(data_disney["IMDB_Rating"])

In [70]:
#Keywords - BOG

In [71]:
#data_keywords.to_csv("KEYWORDS.csv")

In [72]:
#data_disney = data_disney.loc[(~data_disney['IMDB_Rating'].isna()) & ()]


In [73]:
data_rentabilidade = data_disney


In [74]:
data_rentabilidade = data_rentabilidade.loc[~data_disney['Worldwide_Gross'].isna()]

In [75]:
data_rentabilidade

Unnamed: 0,Movie_Title,Release_Date,Year,Genres,Budget,Worldwide_Gross,Production,Directors,Writers,Stars,Keywords,Runtime,IMDB_Rating,Num Votes,Title_Type
1481,101 Dalmatians,1996-12-14,1996,"[Adventure, Comedy, Crime, Family]",75000000.0,3.206893e+08,"[Walt Disney Pictures, Wizzer Productions, G...",[Stephen Herek],"['Dodie Smith', 'John Hughes']","['Glenn Close', 'Jeff Daniels', 'Joely Richard...","['dalmatian', 'live action remake', 'dog actor...",103.0,5.7,94719.0,movie
1614,102 Dalmatians,2000-12-01,2000,"[Adventure, Comedy, Family]",85000000.0,1.836118e+08,"[Cruella Productions, Kanzaman S.A.M., Walt ...",[Kevin Lima],"['Dodie Smith', 'Kristen Buckley']","['Glenn Close', 'Gérard Depardieu', 'Ioan Gruf...","['cruella de vil character', 'scheme', 'plot',...",100.0,4.9,32590.0,movie
1542,A Bug's Life,1998-12-18,1998,"[Animation, Adventure, Comedy, Family]",120000000.0,3.632589e+08,"[Pixar Animation Studios, Walt Disney Pictures]","[Andrew Stanton, John Lasseter]","['John Lasseter', 'Andrew Stanton']","['Kevin Spacey', 'Dave Foley', 'Julia Louis']","['ant', 'circus', 'anthropomorphic insect', 'g...",95.0,7.2,251705.0,movie
1908,A Christmas Carol,2009-11-06,2009,"[Animation, Drama, Family, Fantasy]",200000000.0,3.252866e+08,"[Walt Disney Pictures, ImageMovers Digital, ...",[Robert Zemeckis],"['Robert Zemeckis', 'Charles Dickens']","['Jim Carrey', 'Gary Oldman', 'Colin Firth']","['scrooge', 'christmas', 'ghost', 'christmas e...",96.0,6.8,93462.0,movie
1372,A Far Off Place,1993-07-23,1993,"[Adventure, Drama, Family, Romance]",,1.289075e+07,"[Walt Disney Pictures, Amblin Entertainment, ...",[Mikael Salomon],['Robert Caswell'],"['Reese Witherspoon', 'Ethan Embry', 'Jack Tho...","['farm', 'gamekeeper', 'poacher', 'kalahari', ...",100.0,6.6,3183.0,movie
1462,A Goofy Movie,1996-04-05,1995,"[Animation, Adventure, Comedy, Family, Mus...",,3.534860e+07,"[Walt Disney Pictures, Disney Television Anim...",[Kevin Lima],"['Jymn Magon', 'Jymm Magon', 'Jymn Magon', 'Jy...","['Bill Farmer', 'Jason Marsden', 'Jim Cummings']","['father son relationship', 'older actors youn...",78.0,6.8,41838.0,movie
1451,A Kid in King Arthur's Court,1996-02-09,1995,"[Adventure, Comedy, Family, Fantasy, Romance]",15000000.0,1.340672e+07,"[Tapestry Films, Trimark Pictures, Walt Disn...",[Michael Gottlieb],"['Michael Part', 'Robert L']","['Thomas Ian', 'Joss Ackland', 'Art Malik']","['confidence', 'camelot', 'jousting', 'basebal...",89.0,4.7,5388.0,movie
1520,Air Bud,1998-02-20,1997,"[Comedy, Drama, Family, Sport]",3000000.0,2.314450e+07,"[Walt Disney Pictures, Keystone Pictures]",[Charles Martin Smith],"['Kevin DiCicco', 'Air Bud', 'Paul Tamasy']","['Michael Jeter', 'Kevin Zegers', 'Wendy Makke...","['basketball', 'character name in title', 'str...",98.0,5.2,15475.0,movie
1532,Air Bud: Golden Receiver,1998-08-14,1998,"[Comedy, Family, Sport]",,1.022412e+07,"[Keystone Pictures, Dimension Films, Allianc...",[Richard Martin],"['Kevin DiCicco', 'Air Bud', 'Paul Tamasy']","['Kevin Zegers', 'Cynthia Stevenson', 'Tim Con...","['russian', 'golden retriever', 'dog', 'dog mo...",90.0,4.5,5074.0,movie
1354,Aladdin,1993-07-03,1992,"[Animation, Adventure, Comedy, Family, Fan...",28000000.0,5.040502e+08,"[Walt Disney Pictures, Body Penis Productions...","[Ron Clements, John Musker]","['Ron Clements', 'John Musker']","['Scott Weinger', 'Robin Williams', 'Linda Lar...","['arab', 'genie', 'genie character', 'princess...",90.0,8.0,337270.0,movie


In [76]:
data_rentabilidade.isna().sum()

Movie_Title         0
Release_Date       37
Year                0
Genres              0
Budget             84
Worldwide_Gross     0
Production          0
Directors           0
Writers             0
Stars               0
Keywords            0
Runtime            16
IMDB_Rating         0
Num Votes           0
Title_Type          0
dtype: int64

In [77]:
data_rentabilidade['Worldwide_Gross'] = data_rentabilidade['Worldwide_Gross'].astype('int')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [78]:
data_rentabilidade.to_csv("RENTABILIDADE.csv")

In [79]:
# Criando Dummies para a coluna de diretores para montar um novo dataframe de análise - RENTABILIDADE

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

diretores_rent = pd.DataFrame(mlb.fit_transform(data_rentabilidade['Directors']),columns=mlb.classes_, index=data_rentabilidade.index)

In [80]:
diretores_rent = diretores_rent.join(data_rentabilidade['Worldwide_Gross'])

In [86]:
diretores_rent.head()

Unnamed: 0,Alfred L. Werker,Arthur Davis,Ash Brannon,Barry Cook,Ben Sharpsteen,Bill Roberts,Bob Peterson,Brenda Chapman,Brian Henson,Chris Buck,...,Walt Becker,Walter Murch,Wilfred Jackson,Will Finn,William Beaudine,William Cottrell,William Dear,Wolfgang Reitherman,Xavier Koller,Worldwide_Gross
1481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,320689294
1614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,183611771
1542,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,363258859
1908,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,325286646
1372,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12890752


In [81]:
estrelas_rent = pd.DataFrame(mlb.fit_transform(data_rentabilidade['Stars']),columns=mlb.classes_, index=data_rentabilidade.index)

In [87]:
estrelas_rent = estrelas_rent.join(data_rentabilidade['Worldwide_Gross'])

In [89]:
escritores_rent = pd.DataFrame(mlb.fit_transform(data_rentabilidade['Writers']),columns=mlb.classes_, index=data_rentabilidade.index)

In [90]:
escritores_rent = escritores_rent.join(data_rentabilidade['Worldwide_Gross'])

In [91]:
genero_rent = pd.DataFrame(mlb.fit_transform(data_rentabilidade['Genres']),columns=mlb.classes_, index=data_rentabilidade.index)

In [92]:
genero_rent = genero_rent.join(data_rentabilidade['Worldwide_Gross'])

In [93]:
diretores_rent.to_csv("diretores_rent.csv")

In [94]:
estrelas_rent.to_csv("estrelas_rent.csv")

In [95]:
escritores_rent.to_csv("escritores_rent.csv")

In [96]:
genero_rent.to_csv("genero_rent.csv")