# Importing Pandas

In [None]:
import pandas as pd

# Importing Data

In [None]:
# Column labels supplied to read_csv for the tab-separated file; the first
# column (`id`) is used as the index.
names = [
    'id',
    'title',
    'year',
    'rating',
    'votes',
    'length',
    'genres',
]
data = pd.read_csv(
    "imdb_top_10000.txt",
    sep="\t",
    names=names,
    index_col=0,
)

# Exploring our Data

In [None]:
# Preview the first rows to check the columns were parsed as expected.
data.head()

In [None]:
# Preview the last rows of the frame.
data.tail()

In [None]:
# Print each column's dtype and non-null count.
data.info()

In [None]:
# The `genres` column contains missing values; replace them with the
# placeholder 'no genre'.
# The result is assigned back instead of calling `fillna(..., inplace=True)`
# on the selected column: modifying `data['genres']` in place is chained
# assignment, which newer pandas warns about and which leaves `data`
# unchanged when Copy-on-Write is enabled.
data['genres'] = data['genres'].fillna('no genre')
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the frame.
data.describe()

# Exporting Data

In [None]:
# Write the frame, including its header row and index, as comma-separated text.
data.to_csv(
    'test.csv',
    sep=',',
    header=True,
    index=True,
)

# Sorting Data

In [None]:
# Rows ordered from lowest to highest rating (returns a new frame).
data.sort_values('rating')

In [None]:
# Rows ordered from highest to lowest rating (returns a new frame).
data.sort_values('rating', ascending=False)

# Creating Data Frames from Scratch

In [None]:
# Small hand-written table: each key becomes a column, each list its values.
sample_data = dict(
    tv=[1243, 12352, 132],
    radio=[56, 452, 67],
    news=[5436, 67, 3],
    sales=[3, 664, 1],
)

In [None]:
# Build a DataFrame from the dict of column lists.
data2 = pd.DataFrame(data=sample_data)

In [None]:
# Display the newly built frame.
data2

In [None]:
# Unbind the name `data2`; referencing it afterwards raises NameError.
del data2

In [None]:
# Confirm that the name `data2` no longer exists.
# Only NameError is caught: that is the error an unbound name raises, and a
# bare `except:` would also hide unrelated failures (and KeyboardInterrupt).
try:
    data2
except NameError:
    print('Data has been deleted')

# Selecting Data

In [None]:
# Select one column by label.
data['title']

In [None]:
# Select several columns by passing a list of labels.
data[['title', 'year']]

In [None]:
# Average of the rating column.
data['rating'].mean()

In [None]:
# Lowest value in the rating column.
data['rating'].min()

In [None]:
# Highest value in the rating column.
data['rating'].max()

In [None]:
# Distinct values stored in the genres column.
data['genres'].unique()

In [None]:
# How many rows share each rating value (most frequent first).
data['rating'].value_counts()

In [None]:
# Same counts, ordered by rating value instead of by frequency.
data['rating'].value_counts().sort_index()

In [None]:
# Rating frequencies, listed from the highest rating value down.
(
    data['rating']
    .value_counts()
    .sort_index(ascending=False)
)

# Plotting

In [None]:
%matplotlib inline

In [None]:
# Default DataFrame plot (pandas' default kind is a line plot).
data.plot()

In [None]:
# Scatter plot of votes against rating.
data.plot.scatter(x='rating', y='votes')

In [None]:
# Same scatter plot with translucent markers so dense regions stand out.
data.plot.scatter(x='rating', y='votes', alpha=0.2)

In [None]:
# Histogram of the rating column.
data['rating'].plot.hist()

In [None]:
import seaborn as sns

In [None]:
# Scatter of votes against rating with seaborn's fitted regression line.
sns.lmplot(data=data, x='rating', y='votes')

In [None]:
# Grid of pairwise plots between the frame's columns.
sns.pairplot(data)

# Ordinary Least Squares (OLS) Regression

In [None]:
import statsmodels.api as sm

In [None]:
# Regress votes on rating with ordinary least squares.
# statsmodels' OLS does not include an intercept on its own, so fitting on the
# bare `rating` column would force the line through the origin. add_constant
# adds the constant column so the model is votes = b0 + b1 * rating.
results = sm.OLS(data['votes'], sm.add_constant(data['rating'])).fit()

In [None]:
# Display the fitted model's summary table.
results.summary()

In [None]:
# Element-wise comparison: a boolean Series, True where year is after 1995.
data['year'] > 1995

In [None]:
# Use the boolean mask to keep only rows with year after 1995.
data[data['year'] > 1995]

In [None]:
# Same filtering pattern with a different threshold year.
data[data['year'] > 1966]

In [None]:
# Combine two conditions with `&`: rows whose year lies strictly between
# 1995 and 2000. Each comparison needs its own parentheses.
data.loc[(data['year'] > 1995) & (data['year'] < 2000)]

In [None]:
# Combine two conditions with `|` (logical OR).
# NOTE(review): every year is either > 1995 or < 2000, so this particular
# mask keeps all rows with a non-missing year; it only demonstrates the syntax.
data[(data['year'] > 1995) | (data['year'] < 2000)]

In [None]:
# Rows with year strictly between 1995 and 2000, best rated first.
(
    data[(data['year'] > 1995) & (data['year'] < 2000)]
    .sort_values('rating', ascending=False)
)

# Grouping

In [None]:
# Average rating for each year.
data.groupby('year')['rating'].mean()

In [None]:
# Highest rating for each year.
data.groupby('year')['rating'].max()

In [None]:
# Lowest rating for each year.
data.groupby('year')['rating'].min()

# Challenges

1. What was the highest-scoring movie in 1996?
2. In what year was the highest-rated movie of all time made?
3. Which five movies have the most votes ever?
4. Which year in the 1960s had the highest average movie rating?

## Challenge No 1
what was the highest scoring movie in 1996

In [None]:
# 1996 releases ordered from best to worst rating; show the top rows.
(
    data.loc[data['year'] == 1996]
    .sort_values('rating', ascending=False)
    .head()
)

In [None]:
# Title and rating of the two best-rated 1996 releases.
highest_rated_1996_movie = (
    data[data['year'] == 1996]
    .sort_values(by='rating', ascending=False)
    .loc[:, ['title', 'rating']]
    .head(2)
)
highest_rated_1996_movie

In [None]:
# Just the titles of the rows selected above.
highest_rated_1996_movie['title']

### Alternative Solution

In [None]:
# Keep only the rows released in 1996.
data_1996 = data.loc[data['year'] == 1996]
data_1996

In [None]:
# Every 1996 row whose rating equals that year's maximum (ties included).
data_1996[data_1996['rating'].eq(data_1996['rating'].max())]

## Challenge No 2
in what year was the highest rated movie of all time made?

In [None]:
# Whole data set ordered from best to worst rating; show the top rows.
data.sort_values('rating', ascending=False).head()

In [None]:
# Title, year and rating of the two best-rated rows overall.
highest_rated_movie = (
    data.sort_values(by='rating', ascending=False)
    .head(2)
    .loc[:, ['title', 'year', 'rating']]
)
highest_rated_movie

In [None]:
# Release year(s) of the rows selected above.
highest_rated_movie['year']

### Alternative Solution

In [None]:
# Every row whose rating equals the overall maximum (ties included).
data[data['rating'].eq(data['rating'].max())]

## Challenge No 3
Which five movies have the most votes ever?

In [None]:
# Order by vote count, largest first, and keep the first five rows.
five_most_votes_movies = data.sort_values('votes', ascending=False).head(5)
five_most_votes_movies

In [None]:
# Titles of the five most-voted rows.
five_most_votes_movies['title']

## Challenge No 4
what year in the 1960s had the highest average movie rating

In [None]:
# Per-year averages for the years 1960-1969.
# numeric_only=True restricts the mean to numeric columns; without it recent
# pandas versions raise a TypeError when the frame still holds text columns.
average_1960s_rating = (
    data[(data['year'] >= 1960) & (data['year'] < 1970)]
    .groupby('year')
    .mean(numeric_only=True)
)
average_1960s_rating

In [None]:
# The single year with the highest average rating: sort descending, keep one row.
highest_average_1960s_rating = (
    average_1960s_rating
    .sort_values('rating', ascending=False)
    .head(1)
)
highest_average_1960s_rating

# Cleaning Data
Problems encountered:
1. The year in the title is redundant (there is already a `year` column).
2. The `length` column should have type `int`, not `object`.
3. The genres should be separated into individual columns.

In [None]:
# Look at the raw values again before cleaning.
data.head()

In [None]:
# Check the column dtypes before cleaning.
data.info()

## Formatting Title

### Solution 1

In [None]:
# Cut the last seven characters off every title.
# NOTE(review): presumably this targets a trailing " (YYYY)" suffix; confirm
# that every title ends that way.
data['formatted title'] = data['title'].str.slice(stop=-7)
data.head()

### Solution 2

In [None]:
# Drop the trailing parenthesised suffix by splitting once, from the right,
# on the literal ' (' and keeping the left part.
# Splitting from the right keeps parentheses that belong to the title itself,
# and a literal separator avoids the '\(' escape, which is an invalid escape
# sequence in a non-raw Python string.
data['formatted title'] = data['title'].str.rsplit(' (', n=1).str[0]
data.head()

## Formatting Length
The slicing and splitting approaches used for the title above (Solution 1 and Solution 2) would work here as well.

In [None]:
# Remove the literal 'mins.' unit from each length and convert to integers.
# regex=False pins the pattern as plain text: the default of `regex` differs
# between pandas versions, and as a regex the '.' would match any character.
# Leftover surrounding whitespace is stripped before the integer conversion.
data['formatted length'] = (
    data['length']
    .str.replace('mins.', '', regex=False)
    .str.strip()
    .astype('int')
)
data.head()

In [None]:
# Pairwise plots again, now that the frame also has the numeric length column.
sns.pairplot(data=data)

#### It seems that some movies have a length of 0, and some appear to have 0 votes

In [None]:
# First rows whose cleaned length is exactly 0.
data.loc[data['formatted length'] == 0].head()

In [None]:
# First rows whose vote count is exactly 0.
data.loc[data['votes'] == 0].head()

#### Apparently no movie actually has 0 votes; the lowest vote counts are probably just so small relative to the plot's scale that they appear to sit at 0

In [None]:
# Smallest vote count in the data.
data['votes'].min()

In [None]:
# Largest vote count in the data.
data['votes'].max()

## Formatting Genres

In [None]:
# Distinct raw values of the genres column.
multiple_genres = data['genres'].unique()
multiple_genres

In [None]:
# Collect every individual genre, in first-seen order, from the
# '|'-separated entries in `multiple_genres`.
unique_genres = []
for genres in multiple_genres:
    for genre in str(genres).split('|'):
        # 'no genre' is a placeholder, not a genre. Skipping it here (instead of
        # calling list.remove afterwards) cannot raise ValueError when the
        # placeholder is absent.
        if genre != 'no genre' and genre not in unique_genres:
            unique_genres.append(genre)

print(unique_genres)

In [None]:
# Start the genre table from the cleaned title and the year, exposing the
# cleaned title under the plain column name 'title'.
data_genres = (
    data.loc[:, ['formatted title', 'year']]
    .rename(columns={'formatted title': 'title'})
)
data_genres

In [None]:
# Add one 0/1 indicator column per genre.
# get_dummies splits each '|'-separated entry and matches whole genre names.
# A substring/regex test (str.contains) would also flag rows whose genre merely
# contains the name, e.g. a genre 'Music' would match 'Musical'.
# reindex keeps exactly the genres in `unique_genres`, filling 0 for any that
# never occur.
genre_flags = (
    data['genres']
    .str.get_dummies(sep='|')
    .reindex(columns=unique_genres, fill_value=0)
)
for genre in unique_genres:
    data_genres[genre] = genre_flags[genre].astype('int')
data_genres.head()

In [None]:
# Number of genres per row: add up the 0/1 indicator columns across each row.
data_genres['total genre type'] = data_genres[unique_genres].sum(axis=1)

data_genres.head()

In [None]:
# Rows of the original frame for which no genre indicator is set.
data.loc[data_genres['total genre type'] == 0]

In [None]:
# Histogram of how many genres each row has.
data_genres['total genre type'].plot.hist()

In [None]:
# Number of rows for each genre count.
data_genres.groupby('total genre type').size()

# Finishing Up

In [None]:
# Current state of the main frame before assembling the final table.
data.head()

In [None]:
# Current state of the genre indicator table.
data_genres.head()

In [None]:
# Work on an independent copy. Plain assignment would only create a second
# name for the same object, so column edits made through `data_temp_1` would
# also modify `data`.
data_temp_1 = data.copy()
data_temp_1.head()

In [None]:
# Promote the cleaned columns to their final names...
data_temp_1['title'] = data_temp_1['formatted title']
data_temp_1['length (mins)'] = data_temp_1['formatted length']

# ...then discard the raw and intermediate columns they replace.
superseded_columns = ['length', 'genres', 'formatted length', 'formatted title']
data_temp_1 = data_temp_1.drop(columns=superseded_columns)
data_temp_1.head()

In [None]:
# Genre table without title and year, which the other frame already carries.
data_temp_2 = data_genres.drop(columns=['title', 'year'])
data_temp_2.head()

In [None]:
# Attach the genre columns to the cleaned movie columns, matching rows by index.
complete_data = data_temp_1.join(data_temp_2, how='left')
complete_data.head()

In [None]:
# Check dtypes and non-null counts of the assembled table.
complete_data.info()

In [None]:
# Summary statistics of the assembled table.
complete_data.describe()

In [None]:
# Write the final table, including its header row and index, as comma-separated text.
complete_data.to_csv(
    'test_of_complete_processed_data.csv',
    sep=',',
    header=True,
    index=True,
)