# Question 1

In this question we try to predict the best category for each year.
(Based on the past years)

***Importing libraries***

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import mean_squared_error

***Reading the dataset file***

In [2]:
file_path = './movie_dataset.csv'
movies = pd.read_csv(file_path)

# Preprocessing the data

Here we want to get the best category for each year.

In [3]:
movies.release_date.unique()

array(['2009-12-10', '2007-05-19', '2015-10-26', ..., '2011-12-26',
       '2013-10-13', '2012-05-03'], dtype=object)

We can see that the dates are not "year" only so we need add a column called year and for each movie save the year of the release of the movie.

In [4]:
release_format = pd.to_datetime(movies['release_date'])
release_format

0      2009-12-10
1      2007-05-19
2      2015-10-26
3      2012-07-16
4      2012-03-07
          ...    
4798   1992-09-04
4799   2011-12-26
4800   2013-10-13
4801   2012-05-03
4802   2005-08-05
Name: release_date, Length: 4803, dtype: datetime64[ns]

reference: https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.html

In [5]:
movies['year'] = release_format.dt.year
movies['year']

0       2009.0
1       2007.0
2       2015.0
3       2012.0
4       2012.0
         ...  
4798    1992.0
4799    2011.0
4800    2013.0
4801    2012.0
4802    2005.0
Name: year, Length: 4803, dtype: float64

***reference***

nlargest:

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.nlargest.html

Groupby:

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html

https://www.geeksforgeeks.org/python-pandas-dataframe-groupby/

In [6]:
# here we sort the movies in ascending order, by year and revenue
movies_sorted = movies.sort_values(['year', 'revenue'], ascending=[True, True])

# now we get the top 3 highest grossing movies in each year
top_movies = movies_sorted.groupby('year').apply(lambda x: x.nlargest(3, 'revenue')).reset_index(drop=True)

# here we remove the movies with revenue == 0
top_movies = top_movies.loc[top_movies.revenue > 0]

top_movies.reset_index()
top_movies = top_movies.drop('index', axis=1)
top_movies

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,year
0,385907,Drama,,3059,usa naivety intolerance mill marriage,en,Intolerance,"The story of a poor young woman, separated by ...",3.232447,"[{""name"": ""Triangle Film Corporation"", ""id"": 1...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Cruel Hand of Intolerance,Intolerance,7.4,60,Lillian Gish Mae Marsh Robert Harron F.A. Turn...,"[{'name': 'Tod Browning', 'gender': 2, 'depart...",D.W. Griffith,1916.0
1,245000,Drama Romance War,,3060,world war i silent film,en,The Big Parade,The story of an idle rich boy who joins the US...,0.785744,"[{""name"": ""Metro-Goldwyn-Mayer (MGM)"", ""id"": 8...",...,[],Released,,The Big Parade,7.0,21,John Gilbert Ren\u00e9e Ador\u00e9e Hobart Bos...,"[{'name': 'Irving Thalberg', 'gender': 2, 'dep...",King Vidor,1925.0
2,92620000,Drama Science Fiction,,19,man vs machine underground world inventor metr...,de,Metropolis,In a futuristic city sharply divided between t...,32.351527,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""na...",...,"[{""iso_639_1"": ""xx"", ""name"": ""No Language""}]",Released,There can be no understanding between the hand...,Metropolis,8.0,657,Brigitte Helm Alfred Abel Gustav Fr\u00f6hlich...,"[{'name': 'Erich Pommer', 'gender': 2, 'depart...",Fritz Lang,1927.0
3,379000,Drama Music Romance,,65203,musical singer pre-code wisecrack humor early ...,en,The Broadway Melody,"Harriet and Queenie Mahoney, a vaudeville act,...",0.968865,"[{""name"": ""Metro-Goldwyn-Mayer (MGM)"", ""id"": 8...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The pulsating drama of Broadway's bared heart ...,The Broadway Melody,5.0,19,Charles King Anita Page Bessie Love Nacio Herb...,"[{'name': 'Irving Thalberg', 'gender': 2, 'dep...",Harry Beaumont,1929.0
5,3950000,Action Drama History,,22301,world war i zeppelin royal air force royal fly...,en,Hell's Angels,Two brothers attending Oxford enlist with the ...,8.484123,"[{""name"": ""The Caddo Company"", ""id"": 13866}]",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,Howard Hughes' Thrilling Multi-Million Dollar ...,Hell's Angels,6.1,19,Ben Lyon James Hall Jean Harlow John Darrow Lu...,"[{'name': 'James Whale', 'gender': 2, 'departm...",James Whale,1930.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,190000000,Action,http://www.furious7.com/,168259,car race speed revenge suspense car,en,Furious 7,Deckard Shaw seeks revenge against Dominic Tor...,102.322217,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Vengeance Hits Home,Furious 7,7.3,4176,Vin Diesel Paul Walker Dwayne Johnson Michelle...,"[{'name': 'James Wan', 'gender': 2, 'departmen...",James Wan,2015.0
233,280000000,Action Adventure Science Fiction,http://marvel.com/movies/movie/193/avengers_ag...,99861,marvel comic sequel superhero based on comic b...,en,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,134.279229,"[{""name"": ""Marvel Studios"", ""id"": 420}, {""name...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Age Has Come.,Avengers: Age of Ultron,7.3,6767,Robert Downey Jr. Chris Hemsworth Mark Ruffalo...,"[{'name': 'Danny Elfman', 'gender': 2, 'depart...",Joss Whedon,2015.0
234,250000000,Adventure Action Science Fiction,http://marvel.com/captainamericapremiere,271110,civil war war marvel comic sequel superhero,en,Captain America: Civil War,"Following the events of Age of Ultron, the col...",198.372395,"[{""name"": ""Studio Babelsberg"", ""id"": 264}, {""n...",...,"[{""iso_639_1"": ""ro"", ""name"": ""Rom\u00e2n\u0103...",Released,Divided We Fall,Captain America: Civil War,7.1,7241,Chris Evans Robert Downey Jr. Scarlett Johanss...,"[{'name': 'Ronald R. Reiss', 'gender': 2, 'dep...",Anthony Russo,2016.0
235,175000000,Family Adventure Drama Fantasy,http://movies.disney.com/the-jungle-book-2016,278927,based on novel snake wolf elephant tiger,en,The Jungle Book,After a threat from the tiger Shere Khan force...,94.199316,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,The Jungle Book,6.7,2892,Neel Sethi Bill Murray Ben Kingsley Idris Elba...,"[{'name': 'John Debney', 'gender': 2, 'departm...",Jon Favreau,2016.0


***Reference***

https://www.geeksforgeeks.org/create-a-pandas-dataframe-from-list-of-dicts/

In [7]:
category_counts = {}

# iterating over each year and count the number of occurrences of each genre.
for year, data_ in top_movies.groupby('year'):
    category_counts[year] = {}
    for _, movie in data_.iterrows():
        categories = movie['genres'].split()
#         print(f'category = [{len(categories)}]')
        for category in categories:
            if category in category_counts[year]:
                category_counts[year][category] += 1
            else:
                category_counts[year][category] = 1

                
# here we create a list of dictionaries to store the data we want:
# for each year we choose the genre that appears most of the times.
best_categories_list = [{'Year': year, 'Category': max(category_counts[year], key=category_counts[year].get), 'Occurrences': max(category_counts[year].values())} for year in category_counts]

# creating a dataframe using the List of Dictionaries
best_categories = pd.DataFrame(best_categories_list)

best_categories

Unnamed: 0,Year,Category,Occurrences
0,1916.0,Drama,1
1,1925.0,Drama,1
2,1927.0,Drama,1
3,1929.0,Drama,1
4,1930.0,Action,1
...,...,...,...
84,2012.0,Action,3
85,2013.0,Animation,2
86,2014.0,Action,3
87,2015.0,Action,3


***Using Logistic regression to classify to genre/category***

In [9]:
# using one hot encoding
encoded_categories = pd.get_dummies(best_categories['Category'], prefix='Category')

# combining the encoded result with the other features
features = pd.concat([best_categories[['Year', 'Occurrences']], encoded_categories], axis=1)

y = best_categories['Category']

X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=42)

# training the model
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train, y_train)

# sve the predictions
y_pred = logistic_regression.predict(X_test)

# measuring the preformence
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9444444444444444
Precision: 0.6666666666666666
Recall: 0.6666666666666666
F1-score: 0.6666666666666666


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)

       Actual  Predicted
44      Drama      Drama
53  Adventure  Adventure
30      Drama      Drama
12      Drama      Drama
49  Adventure  Adventure
0       Drama      Drama
64     Family     Family
18      Drama      Drama
10    Fantasy     Comedy
22     Action     Action
4      Action     Action
33      Drama      Drama
42      Drama      Drama
80     Action     Action
81  Adventure  Adventure
39     Action     Action
77  Adventure  Adventure
61  Adventure  Adventure


***Using KNN algorithm***

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [12]:
# scale the feature values
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# train the model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.8888888888888888
Precision: 0.5555555555555556
Recall: 0.6
F1-score: 0.575


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results)

       Actual  Predicted
44      Drama      Drama
53  Adventure  Adventure
30      Drama      Drama
12      Drama      Drama
49  Adventure  Adventure
0       Drama      Drama
64     Family      Drama
18      Drama      Drama
10    Fantasy      Drama
22     Action     Action
4      Action     Action
33      Drama      Drama
42      Drama      Drama
80     Action     Action
81  Adventure  Adventure
39     Action     Action
77  Adventure  Adventure
61  Adventure  Adventure
