In [None]:
import numpy as np
import pandas as pd
# Use the fully qualified option key: the bare 'max_columns' pattern can match
# more than one registered option (e.g. 'styler.render.max_columns'), in which
# case pandas refuses to set it.
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.corpus import stopwords  
stop_words = set(stopwords.words('english'))
import os


import plotly as py
#import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image

In [None]:
# Both CSV files are read from the same input directory.
DATA_DIR = "../input/tmdb-box-office-prediction"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Preview the first five rows of the test frame.
test.head(5)

# Data Viz With seaborn

## Visualizing the target distribution

In [None]:
# Side-by-side histograms of the raw and log1p-transformed revenue.
# Create the 1x2 grid once and draw on each Axes explicitly; mixing
# plt.subplots() with plt.subplot() leaves an extra full-figure Axes
# underneath the two panels.
# NOTE(review): distplot is deprecated in newer seaborn releases; switch to
# histplot once the target environment's seaborn version provides it.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
sns.distplot(train['revenue'], kde=False, ax=ax[0])
ax[0].set_title('Distribution of revenue')
sns.distplot(np.log1p(train['revenue']), kde=False, ax=ax[1])
ax[1].set_title('Distribution of log revenue');

In [None]:
# Store log(1 + revenue) as a separate column of the train frame.
train['log_revenue'] = train['revenue'].pipe(np.log1p)


## Analyzing the relationship between budget and revenue

In [None]:
# Scatter plots of revenue against budget, on the raw and the log1p scale.
# x and y are passed by keyword: recent seaborn versions reject positional
# data vectors. The 1x2 grid is created once and each panel is drawn on its
# own Axes instead of stacking plt.subplot() panels over a full-figure Axes.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
sns.scatterplot(x=train['budget'], y=train['revenue'], ax=ax[0])
ax[0].set_title('Revenue vs Budget')
sns.scatterplot(x=np.log1p(train['budget']), y=train['log_revenue'], ax=ax[1])
ax[1].set_title(' Log revenue vs log budget');

In [None]:
# Add log(1 + budget) to both frames so train and test share the feature.
for frame in (train, test):
    frame['log_budget'] = np.log1p(frame['budget'])

## effect of homepage on revenue

In [None]:
# Ten most frequent homepage values in the train frame.
train['homepage'].value_counts().iloc[:10]

In [None]:
# 1 when a homepage value is present, 0 when it is missing.
train['has_homepage'] = train['homepage'].notnull().astype('int64')

In [None]:
# Mirror the train-set homepage indicator on the test frame so both frames
# carry the feature: 1 when a homepage value is present, 0 when it is missing.
test['has_homepage'] = 0
test.loc[test['homepage'].notnull(), 'has_homepage'] = 1

In [None]:
# Per-film revenue, split by the homepage indicator.
sns.catplot(data=train, x='has_homepage', y='revenue');
plt.title('Revenue for films with and without a homepage')

## Distribution of languages in films

##### language data of top 10 languages

In [None]:
# Keep only the rows whose original language is among the ten most frequent.
top_languages = train['original_language'].value_counts().head(10).index
language_data = train.loc[train['original_language'].isin(top_languages)]

In [None]:
# Box plots of revenue and log revenue per original language in language_data.
# The 1x2 grid is created once and each panel is drawn on its own Axes, so no
# extra full-figure Axes is left underneath. The titles no longer say "mean":
# a box plot shows the median and quartiles, not the mean.
fig, ax = plt.subplots(1, 2, figsize=(20, 8))
sns.boxplot(x='original_language', y='revenue', data=language_data, ax=ax[0])
ax[0].set_title('Revenue per language')
sns.boxplot(x='original_language', y='log_revenue', data=language_data, ax=ax[1])
ax[1].set_title('Log revenue per language');

## Frequent words in movie titles

In [None]:
# Word cloud built from every original title in the train frame.
plt.figure(figsize=(12,12))
# Missing titles become empty strings; str.join raises on a float NaN.
text = ' '.join(train['original_title'].fillna('').values)
wordcloud = WordCloud(max_font_size=None,
                     background_color='white',
                     width=1200,height=1000).generate(text)

plt.imshow(wordcloud)
plt.title('Top words across movie titles')
plt.axis('off')
plt.show()

## Word cloud for description of movies

In [None]:
# Word cloud built from every overview in the train frame; missing overviews
# contribute an empty string.
text = train['overview'].fillna('').str.cat(sep=' ')
cloud_options = {
    'max_font_size': None,
    'background_color': 'white',
    'width': 1200,
    'height': 1000,
}
wordcloud = WordCloud(**cloud_options)
wordcloud = wordcloud.generate(text)

plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)
plt.title('Top words across movie overview')
plt.axis('off')
plt.show()

## Words in the description associated with high and low revenue

In [None]:
import eli5
from sklearn.linear_model import LinearRegression

In [None]:
# TF-IDF over unigrams and bigrams of the overview text, keeping only terms
# that occur in at least five documents.
tfidf_params = dict(
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    min_df=5,
)
vectorizer = TfidfVectorizer(**tfidf_params)
overview_text = vectorizer.fit_transform(train['overview'].fillna(''))

# Linear fit of log revenue on the TF-IDF features (fit returns the estimator).
linreg = LinearRegression().fit(overview_text, train['log_revenue'])


def _is_not_bias(feature_name):
    """Return True for every feature except the '<BIAS>' term."""
    return feature_name != '<BIAS>'


# Show the 20 largest weights, leaving the bias term out of the table.
eli5.show_weights(linreg, vec=vectorizer, top=20, feature_filter=_is_not_bias)

# Analyzing with plotly

## Analyzing movie release dates

In [None]:
# First few non-missing release dates in the test frame.
test['release_date'].dropna().head()

#### Let's fix the dates: the year is stored with two digits, so expand it to four

In [None]:
def fix_date(x, pivot_year=19):
    """Expand the two-digit year of an ``m/d/yy`` date string to four digits.

    Two-digit years less than or equal to ``pivot_year`` are placed in the
    2000s; all other two-digit years are placed in the 1900s.

    Parameters
    ----------
    x : str
        Date written as '/'-separated fields with the year in third position.
    pivot_year : int, default 19
        Largest two-digit year that is mapped to the 2000s.

    Returns
    -------
    str
        The date with a four-digit year. A date whose year already has four
        digits is returned unchanged, as is any non-string input (e.g. NaN).

    Raises
    ------
    IndexError
        If the string has fewer than three '/'-separated fields.
    ValueError
        If the year field is not an integer.
    """
    if not isinstance(x, str):
        # Missing values are not strings; hand them back untouched.
        return x
    parts = x.split('/')
    year = parts[2]
    if len(year) == 4:
        # Already expanded, so applying the function a second time is harmless.
        return x
    century = '20' if int(year) <= pivot_year else '19'
    # Rebuild from the fields instead of slicing two characters off the end,
    # so a one-digit year cannot swallow the separator.
    return '/'.join(parts[:2] + [century + year.zfill(2)])

In [None]:
# Rows of the test frame that have no release date.
test[test['release_date'].isnull()].head()

##### Fill the missing release date with a placeholder value

In [None]:
# Replace missing release dates in the test frame with a fixed placeholder.
test['release_date'] = test['release_date'].fillna('05/01/00')

In [None]:
# Run fix_date over every release date in both frames.
for frame in (train, test):
    frame['release_date'] = frame['release_date'].apply(fix_date)

## create features based on release date

In [None]:
# Parse the release dates with an explicit month/day/four-digit-year format
# instead of relying on per-value format inference, so day and month can never
# be swapped and an unexpected layout fails loudly.
# NOTE(review): assumes every value is 'm/d/YYYY' at this point, which is what
# fix_date produces — confirm against the raw data.
RELEASE_DATE_FORMAT = '%m/%d/%Y'
train['release_date'] = pd.to_datetime(train['release_date'], format=RELEASE_DATE_FORMAT)
test['release_date'] = pd.to_datetime(test['release_date'], format=RELEASE_DATE_FORMAT)

In [None]:
def process_date(df):
    """Add integer calendar features derived from ``df['release_date']``.

    For each of year, weekday, month, weekofyear, day and quarter a column
    named ``release_date_<part>`` is written to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``release_date`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place, with the new columns.
    """
    date_parts = ['year','weekday','month','weekofyear','day','quarter']
    accessor = df['release_date'].dt
    for part in date_parts:
        if part == 'weekofyear' and hasattr(accessor, 'isocalendar'):
            # The .dt.weekofyear attribute is gone from recent pandas; the ISO
            # calendar week is its replacement. Older pandas without
            # isocalendar() falls through to the attribute lookup below.
            values = accessor.isocalendar().week
        else:
            values = getattr(accessor, part)
        df['release_date' + '_' + part] = values.astype(int)
    return df

# Add the release-date features to both frames.
train, test = (process_date(frame) for frame in (train, test))

## Using Plotly to visualize the datasets

In [None]:
# Number of films per release year, ordered by year: d1 for train, d2 for test.
d1, d2 = (
    frame['release_date_year'].value_counts().sort_index()
    for frame in (train, test)
)

In [None]:
# One line per dataset: films released per year in train (d1) and test (d2).
fig = go.Figure()
for label, yearly_counts in (('train', d1), ('test', d2)):
    fig.add_trace(go.Scatter(x=yearly_counts.index, y=yearly_counts.values, name=label))

layout_labels = {
    'title': 'Number of movies released per year',
    'xaxis_title': 'Year',
    'yaxis_title': 'count',
}
fig.update_layout(**layout_labels)

fig.show()

## number of films per year and revenue per year

In [None]:
# d1: number of train films per release year, ordered by year.
# d2: summed revenue of those films per release year.
release_years = train['release_date_year']
d1 = release_years.value_counts().sort_index()
d2 = train.groupby('release_date_year')['revenue'].agg('sum')

In [None]:
# Film count (primary y-axis) and total revenue (secondary y-axis) per year.
# The redundant double assignment `fig = fig = ...` is removed, and the title
# now names both series that are plotted.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x = d1.index, y = d1.values, name='film count'), secondary_y=False)
fig.add_trace(go.Scatter(x = d2.index, y = d2.values, name='total revenue'), secondary_y=True)

fig.update_layout(
    title_text='Number of films and total revenue per year'
)

fig.update_xaxes(title_text='Year')

fig.update_yaxes(title_text="<b> count</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>revenue</b> ", secondary_y=True)

fig.show()




### Let's do the same with average revenue

In [None]:
# d1: number of train films per release year, ordered by year.
# d2: mean revenue of those films per release year.
release_years = train['release_date_year']
d1 = release_years.value_counts().sort_index()
d2 = train.groupby('release_date_year')['revenue'].agg('mean')

In [None]:
# Film count (primary y-axis) and average revenue (secondary y-axis) per year.
# The redundant double assignment `fig = fig = ...` is removed, and the title
# no longer repeats the count-only wording: it names the average-revenue
# series this chart adds.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x = d1.index, y = d1.values, name='film count'), secondary_y=False)
fig.add_trace(go.Scatter(x = d2.index, y = d2.values, name='  Average revenue'), secondary_y=True)

fig.update_layout(
    title_text='Number of films and average revenue per year'
)

fig.update_xaxes(title_text='Year')

fig.update_yaxes(title_text="<b> count</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Average revenue</b> ", secondary_y=True)

fig.show()

# Do release days matter?

In [None]:
# Per-film revenue grouped by the weekday number of the release date.
sns.catplot(data=train, x='release_date_weekday', y='revenue');
plt.title('Revenue on different days of the week')

# Relationship between runtime and revenue

In [None]:
# Histogram of runtime divided by 60; missing runtimes are counted as 0.
runtime_hours = train['runtime'].fillna(0) / 60
sns.distplot(runtime_hours, kde=False, bins=40)
plt.title('Distribution of the length of film in Hours')

In [None]:
# Revenue against runtime divided by 60; missing runtimes are plotted at 0.
# x and y are passed by keyword because recent seaborn versions reject
# positional data vectors.
sns.scatterplot(x=train['runtime'].fillna(0)/60, y=train['revenue'])
plt.title('Runtime vs  revenue')