In [757]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack

films = pd.read_csv('films_clean.csv')


## Defining Features for Machine Learning

In [758]:
#Dropping all columns I won't need and keeping other for feature engineering
films = films.drop(columns= ['original_title', 'original_language','production_countries', 'production_companies','runtime','spoken_languages', 'vote_average', 'vote_count', 'director'])

## Drop the films with a NaN overview

In [759]:
films = films.drop(films[films['overview'] == 'No overview found.'].index)

## Splitting the dataset into male directed/female directed

In [760]:
female_directed= films[films['director_gender']== 'female']
male_directed= films[films['director_gender']== 'male']

In [761]:
female_directed

Unnamed: 0,genres,id,overview,popularity,title,year,director_gender
13,"Drama, Romance",8391,"A prudish woman, working on tenure as a litera...",3.302524,When Night Is Falling,1995,female
18,"Comedy, Romance",4482,"After learning of her husband's infidelities, ...",2.518051,French Twist,1995,female
44,"Drama, Romance",36834,"Young, wild poet Arthur Rimbaud and his mentor...",4.546552,Total Eclipse,1995,female
62,"Drama, Romance",40156,Father Greg Pilkington (Linus Roache) is torn ...,1.881932,Priest,1994,female
85,"Action, Drama, Adventure, Family",14522,"The fates of horses, and the people who own an...",8.398845,Black Beauty,1994,female
...,...,...,...,...,...,...,...
9548,Drama,88844,"Marian, a middle aged nurse, devotes herself t...",0.547712,Code Blue,2011,female
9549,"Drama, Romance",182981,"Naomi, a fifteen year-old Dutch girl from Sout...",0.420688,Nude Area,2014,female
9553,"Comedy, Drama",30019,"As young children, half-siblings Axel and Yann...",1.056898,Upperdog,2009,female
9557,"Comedy, Drama",366505,Alma’s family has been producing quality olive...,2.541633,The Olive Tree,2016,female


In [762]:
female_directed = female_directed.drop(columns='director_gender')
male_directed = male_directed.drop(columns='director_gender')

In [763]:
male_directed.shape

(8881, 6)

In [764]:
female_directed.dtypes

genres         object
id              int64
overview       object
popularity    float64
title          object
year            int64
dtype: object

## Conducting NLP on 'film overview' column

In [765]:
from sklearn.feature_extraction.text import CountVectorizer
text = films['overview']
text_male = male_directed['overview']
text_female = female_directed['overview']

In [766]:
text

0       An ugly duckling having undergone a remarkable...
1       When a lawyer shows up at the vampire's doorst...
2       Morgan Adams and her slave, William Shaw, are ...
3       The life of the gambling paradise – Las Vegas ...
4       Rich Mr. Dashwood dies, leaving his second wif...
                              ...                        
9574    The Sublet is a suspense driven psychological ...
9575    A stranger named Silas flees from a devastatin...
9576    Pretty, popular, and slim high-schooler Aly Sc...
9577    Hyperactive teenager Kelly is enrolled into a ...
9578    Yet another version of the classic epic, with ...
Name: overview, Length: 9539, dtype: object

In [767]:
vectorizer = CountVectorizer()
vectorizer.fit(text)

CountVectorizer()

In [768]:
vectors_male = vectorizer.transform(text_male).todense()
vectors_female = vectorizer.transform(text_female).todense()

In [769]:
vocabulary = vectorizer.get_feature_names_out()

In [770]:
male_vectorized = pd.DataFrame(vectors_male, columns=vocabulary)
female_vectorized = pd.DataFrame(vectors_female, columns=vocabulary)

In [771]:
male_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [772]:
female_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
654,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
656,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Normalizing Numerical Data: Popularity

In [773]:
m_normalized_popularity = male_directed['popularity']
f_normalized_popularity = female_directed['popularity']

In [774]:
m_normalized_popularity = m_normalized_popularity.values.reshape(-1, 1)

In [775]:
f_normalized_popularity = f_normalized_popularity.values.reshape(-1, 1)

In [776]:
scaler = StandardScaler()
m_normalized_popularity = scaler.fit_transform(m_normalized_popularity)
f_normalized_popularity = scaler.fit_transform(f_normalized_popularity)

In [777]:
m_normalized_popularity = pd.DataFrame(m_normalized_popularity)
f_normalized_popularity = pd.DataFrame(f_normalized_popularity)

## Normalizing Numerical Data: Year

In [778]:
m_normalized_year = male_directed['year']
f_normalized_year = female_directed['year']

In [779]:
m_normalized_year = m_normalized_year.values.reshape(-1, 1)
f_normalized_year = f_normalized_year.values.reshape(-1, 1)

In [780]:
scaler = StandardScaler()
m_normalized_year = scaler.fit_transform(m_normalized_year)
f_normalized_year = scaler.fit_transform(f_normalized_year)

In [781]:
m_normalized_year = pd.DataFrame(m_normalized_year)
f_normalized_year = pd.DataFrame(f_normalized_year)

In [782]:
f_normalized_year

Unnamed: 0,0
0,-0.669476
1,-0.669476
2,-0.669476
3,-0.744984
4,-0.744984
...,...
653,0.538656
654,0.765181
655,0.387640
656,0.916198


## Vectorizing Categorical Data

In [783]:
genre_vectorizer = CountVectorizer()
genre_vectorizer.fit(films['genres'])
male_genres = genre_vectorizer.transform(male_directed['genres']).toarray()
female_genres = genre_vectorizer.transform(female_directed['genres']).toarray()
genre_vectorizer.vocabulary_

{'comedy': 3,
 'romance': 14,
 'horror': 11,
 'action': 0,
 'adventure': 1,
 'drama': 6,
 'crime': 4,
 'thriller': 16,
 'fantasy': 8,
 'sciencefiction': 15,
 'history': 10,
 'war': 18,
 'foreign': 9,
 'mystery': 13,
 'family': 7,
 'documentary': 5,
 'western': 19,
 'music': 12,
 'animation': 2,
 'tvmovie': 17}

In [784]:
male_genres = pd.DataFrame(male_genres, columns=genre_vectorizer.get_feature_names())
female_genres = pd.DataFrame(female_genres, columns=genre_vectorizer.get_feature_names())



In [785]:
female_genres

Unnamed: 0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,foreign,history,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,war,western
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
654,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
655,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
656,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


## Normalizing Categorical data

In [786]:
from sklearn.preprocessing import Normalizer

In [787]:
normalizer = Normalizer(norm='l2')

In [788]:
male_genres_overview = pd.concat([male_genres, male_vectorized], axis=1)
female_genres_overview = pd.concat([female_genres, female_vectorized], axis=1)

In [789]:
male_genres_overview

Unnamed: 0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,foreign,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8877,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8878,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8879,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [790]:
female_genres_overview

Unnamed: 0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,foreign,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
654,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
655,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
656,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [791]:
#male_vectorized = male_vectorized.values.reshape(-1, 1)
#female_vectorized = female_vectorized.values.reshape(-1, 1)

In [792]:

#M =  hstack([male_genres, male_vectorized])
#M =  hstack([male_genres, female_vectorized])

In [793]:
male_vectorized.shape

(8881, 35133)

In [794]:
#svd = TruncatedSVD(n_components=10)
#svd.fit_transform(male_vectorized)
#svd.fit_transform(female_vectorized)

In [795]:
male_genres

Unnamed: 0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,foreign,history,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,war,western
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0
8877,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
8878,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
8879,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [796]:
male_genres_overview= pd.DataFrame(male_genres_overview)
female_genres_overview= pd.DataFrame(female_genres_overview)

## Concatenate a final dataframe before Machine Learning: female & male

In [807]:
final_female = pd.concat([female_genres_overview, f_normalized_year], axis=1)
final_male = pd.concat([male_genres_overview, m_normalized_year], axis=1)


In [808]:
male_data = final_male.values
female_data = final_female.values
n_neighbors = 5 
model = NearestNeighbors(n_neighbors=n_neighbors)
model.fit(female_data)


NearestNeighbors()

In [800]:
final_female

Unnamed: 0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,foreign,...,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑,0
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.669476
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.669476
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.669476
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.744984
4,1,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,-0.744984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.538656
654,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.765181
655,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.387640
656,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.916198


In [809]:
distances, indices = model.kneighbors(male_data[0:1])

In [810]:
distances

array([[5.50311604, 5.83517255, 5.87095662, 6.01215874, 6.1648215 ]])

In [811]:
indices

array([[518, 652, 526, 314, 602]])

In [815]:
female_directed.iloc[[652]]

Unnamed: 0,genres,id,overview,popularity,title,year
9517,"Comedy, Romance",72363,"A love triangle between a businessman, his wif...",1.18741,I'm Staying,2003
