## Importing libraries

In [357]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
films = pd.read_csv('films_clean.csv')


## Defining Features for Machine Learning

In [358]:
#Dropping all columns I won't need and keeping other for feature engineering
films = films.drop(columns= ['original_title', 'original_language','production_countries', 'production_companies','runtime','spoken_languages', 'vote_average', 'vote_count', 'director'])

## Drop null values

In [451]:
#Drop the movies without a description
films = films.drop(films[films['overview'] == 'No overview found.'].index)

## Splitting the datasets

In [452]:
# Split the dataset into male directed and female directed movies
female_directed= films[films['director_gender']== 'female']
male_directed= films[films['director_gender']== 'male']

## Conducting NLP on 'film overview' column

In [364]:
from sklearn.feature_extraction.text import CountVectorizer
text = films['overview']
text_male = male_directed['overview']
text_female = female_directed['overview']

#I will use a stemmer to get rid of insignificant words

In [365]:

stemmer = EnglishStemmer()


In [367]:
default_analyzer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS).build_analyzer()

In [368]:
def analyze_with_stemming(text):
    unstemmed_words = default_analyzer(text)
    return (stemmer.stem(word) for word in unstemmed_words)

In [454]:
#I want to have an overview of what the stemmer has done
list(analyze_with_stemming(text[0]))

['ugli',
 'duckl',
 'have',
 'undergon',
 'remark',
 'chang',
 'harbor',
 'feel',
 'crush',
 'carefre',
 'playboy',
 'busi',
 'focus',
 'brother',
 'say']

In [371]:
stemmer_vectorizer = CountVectorizer(analyzer=analyze_with_stemming)

In [372]:
vectors = stemmer_vectorizer.fit(text)

In [373]:
male_vectorized = vectors.transform(text_male).todense()
female_vectorized = vectors.transform(text_female).todense()

In [374]:
vocabulary = vectors.get_feature_names_out()

In [375]:
male_vectorized

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [376]:
male_vectorized = pd.DataFrame(male_vectorized, columns=vocabulary)
female_vectorized = pd.DataFrame(female_vectorized, columns=vocabulary)

In [377]:
male_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [378]:
female_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
654,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
656,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Normalizing Numerical Data: Popularity

In [379]:
m_normalized_popularity = male_directed['popularity']
f_normalized_popularity = female_directed['popularity']

In [380]:
m_normalized_popularity = m_normalized_popularity.values.reshape(-1, 1)

In [381]:
f_normalized_popularity = f_normalized_popularity.values.reshape(-1, 1)

In [382]:
from sklearn.preprocessing import StandardScaler

In [383]:
scaler = StandardScaler()
m_normalized_popularity = scaler.fit_transform(m_normalized_popularity)
f_normalized_popularity = scaler.fit_transform(f_normalized_popularity)

In [384]:
m_normalized_popularity = pd.DataFrame(m_normalized_popularity)
f_normalized_popularity = pd.DataFrame(f_normalized_popularity)

## Normalizing Numerical Data: Year

In [385]:
m_normalized_year = male_directed['year']
f_normalized_year = female_directed['year']

In [386]:
m_normalized_year = m_normalized_year.values.reshape(-1, 1)
f_normalized_year = f_normalized_year.values.reshape(-1, 1)

In [387]:
scaler = StandardScaler()
m_normalized_year = scaler.fit_transform(m_normalized_year)
f_normalized_year = scaler.fit_transform(f_normalized_year)

In [388]:
m_normalized_year = pd.DataFrame(m_normalized_year)
f_normalized_year = pd.DataFrame(f_normalized_year)

In [389]:
f_normalized_year

Unnamed: 0,0
0,-0.669476
1,-0.669476
2,-0.669476
3,-0.744984
4,-0.744984
...,...
653,0.538656
654,0.765181
655,0.387640
656,0.916198


## Vectorizing Categorical Data

In [390]:
genre_vectorizer = CountVectorizer()
genre_vectorizer.fit(films['genres'])
male_genres = genre_vectorizer.transform(male_directed['genres']).toarray()
female_genres = genre_vectorizer.transform(female_directed['genres']).toarray()
genre_vectorizer.vocabulary_

{'comedy': 3,
 'romance': 14,
 'horror': 11,
 'action': 0,
 'adventure': 1,
 'drama': 6,
 'crime': 4,
 'thriller': 16,
 'fantasy': 8,
 'sciencefiction': 15,
 'history': 10,
 'war': 18,
 'foreign': 9,
 'mystery': 13,
 'family': 7,
 'documentary': 5,
 'western': 19,
 'music': 12,
 'animation': 2,
 'tvmovie': 17}

In [391]:
male_genres = pd.DataFrame(male_genres, columns=genre_vectorizer.get_feature_names())
female_genres = pd.DataFrame(female_genres, columns=genre_vectorizer.get_feature_names())



In [392]:
type(female_genres)

pandas.core.frame.DataFrame

## Normalizing Categorical data

In [393]:
from sklearn.preprocessing import Normalizer

In [394]:
normalizer = Normalizer(norm='l2')

In [395]:
male_genres

Unnamed: 0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,foreign,history,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,war,western
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0
8877,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0
8878,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
8879,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [396]:
male_genres_overview = pd.concat([male_genres, male_vectorized], axis=1)
female_genres_overview = pd.concat([female_genres, female_vectorized], axis=1)

In [397]:
normalizer.fit_transform(male_genres_overview)
normalizer.fit_transform(female_genres_overview)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [400]:
male_vectorized

Unnamed: 0,00,000,000th,007,01,04,07am,10,100,1000,...,गल,ஆதவன,யப,ரம,ரமண,たけみかずち,ひめ,주식회사,찾기,첫사랑
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8876,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8877,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8878,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Concatenate a final dataframe before Machine Learning: female & male

In [444]:
female_movies = pd.concat([female_genres_overview, f_normalized_year], axis=1)
male_movies = pd.concat([male_genres_overview, m_normalized_year], axis=1)


In [445]:
n_neighbors = 5
model = NearestNeighbors(n_neighbors=n_neighbors)
model.fit(female_movies)




NearestNeighbors()

In [447]:
distances, indices = model.kneighbors(male_movies[8880:8881])



In [446]:
male_directed[8880:8881]

Unnamed: 0,index,genres,id,overview,popularity,title,year,director_gender
8880,8880,"Action, Drama, Romance",30840,"Yet another version of the classic epic, with ...",5.683753,Robin Hood,1991,male


In [422]:
distances

array([[6.25117237, 6.25791077, 6.32698373, 6.41571876, 6.48198012]])

In [448]:
indices

array([[107, 470, 518, 314, 395]])

In [425]:
male_directed.iloc[[67]]

Unnamed: 0,genres,id,overview,popularity,title,year,director_gender
67,"Comedy, Drama, Romance",36614,"""The Sum of Us"" is an Aussie story about a fat...",1.407427,The Sum of Us,1994,male


In [449]:
female_directed.iloc[[107, 470, 518, 314, 395]]

Unnamed: 0,genres,id,overview,popularity,title,year,director_gender
107,"Drama, Romance",28200,Story of the relationship between the poets Te...,11.37237,Sylvia,2003,female
470,"Drama, Romance",57327,The end of an affair from the woman's point of...,0.410894,Post Coitum,1997,female
518,"Comedy, Drama, Romance",214129,An ugly ducking attempts to become desirable.,0.834795,Bakit Hindi Ka Crush Ng Crush Mo?,2013,female
314,"Comedy, Romance",75802,A romantic comedy about the invention of the v...,14.331454,Hysteria,2011,female
395,Drama,155890,A banker strikes up a sadomasochistic relation...,1.249059,Tied,2013,female


In [450]:
female_directed

Unnamed: 0,genres,id,overview,popularity,title,year,director_gender
0,"Drama, Romance",8391,"A prudish woman, working on tenure as a litera...",3.302524,When Night Is Falling,1995,female
1,"Comedy, Romance",4482,"After learning of her husband's infidelities, ...",2.518051,French Twist,1995,female
2,"Drama, Romance",36834,"Young, wild poet Arthur Rimbaud and his mentor...",4.546552,Total Eclipse,1995,female
3,"Drama, Romance",40156,Father Greg Pilkington (Linus Roache) is torn ...,1.881932,Priest,1994,female
4,"Action, Drama, Adventure, Family",14522,"The fates of horses, and the people who own an...",8.398845,Black Beauty,1994,female
...,...,...,...,...,...,...,...
653,Drama,88844,"Marian, a middle aged nurse, devotes herself t...",0.547712,Code Blue,2011,female
654,"Drama, Romance",182981,"Naomi, a fifteen year-old Dutch girl from Sout...",0.420688,Nude Area,2014,female
655,"Comedy, Drama",30019,"As young children, half-siblings Axel and Yann...",1.056898,Upperdog,2009,female
656,"Comedy, Drama",366505,Alma’s family has been producing quality olive...,2.541633,The Olive Tree,2016,female


## User interface

In [443]:
female_directed= female_directed.reset_index(drop=True)

In [416]:
male_directed.reset_index(drop=True, inplace=True)

In [417]:
male_directed.tail()

Unnamed: 0,genres,id,overview,popularity,title,year,director_gender
8876,"Drama, Thriller, Mystery, Horror",365432,The Sublet is a suspense driven psychological ...,1.339355,The Sublet,2015,male
8877,"Action, Thriller, Mystery, Horror",45527,A stranger named Silas flees from a devastatin...,1.270832,The Final Storm,2010,male
8878,"Drama, Family, TVMovie",39562,"Pretty, popular, and slim high-schooler Aly Sc...",0.767762,To Be Fat Like Me,2007,male
8879,Comedy,14008,Hyperactive teenager Kelly is enrolled into a ...,4.392389,Cadet Kelly,2002,male
8880,"Action, Drama, Romance",30840,"Yet another version of the classic epic, with ...",5.683753,Robin Hood,1991,male


In [418]:
print("Male_directed Movie Titles:")
for i, row in male_directed.iterrows():
    print(f"{i}: {row['title']}")

Male_directed Movie Titles:
0: Sabrina
1: Dracula: Dead and Loving It
2: Cutthroat Island
3: Casino
4: Sense and Sensibility
5: Assassins
6: Othello
7: Persuasion
8: The City of Lost Children
9: Shanghai Triad
10: Carrington
11: Dead Man Walking
12: Richard III
13: Lamerica
14: Georgia
15: The Postman
16: The Confessional
17: Les Miserables
18: The White Balloon
19: A Midwinter's Tale
20: La Haine
21: Shopping
22: Heidi Fleiss: Hollywood Madam
23: Rumble in the Bronx
24: Margaret's Museum
25: Happiness Is in the Field
26: The Boys of St. Vincent
27: Chungking Express
28: Frankie Starlight
29: Nueba Yol
30: The Neon Bible
31: An Awfully Big Adventure
32: Amateur
33: Rob Roy
34: Belle de Jour
35: Canadian Bacon
36: Johnny Mnemonic
37: Love & Human Remains
38: Moonlight and Valentino
39: Mute Witness
40: Safe
41: The Browning Version
42: Circle of Friends
43: Death and the Maiden
44: Eat Drink Man Woman
45: Exotica
46: French Kiss
47: The Glass Shield
48: Heavenly Creatures
49: Ladybird L

In [419]:
#chosen_index = int(input("Enter the index of the movie you want: "))
#chosen_movie = male_data[chosen_index]
#chosen_distance = distances[0][chosen_index]
#chosen_indices = indices[0][:3]
#
#print(f"Chosen Movie: {chosen_movie}")
#print(f"Distance: {chosen_distance}")
#print(f"Indices: {chosen_indices}")

KeyboardInterrupt: Interrupted by user

In [433]:
male_directed.reset_index(inplace=True)

In [434]:
def get_title_from_index(index):
    return male_directed[male_directed.index == index]['title'].values[0]
def get_index_from_title(title):
    return male_directed[male_directed.title == title]['index'].values[0]


#input("Enter the index of the movie you want: ")



In [440]:
get_index_from_title('Cadet Kelly')

8879