#### Import dependencies

In [1]:
import bs4, re, time, requests, sklearn, gensim, sys, os, time
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Record the interpreter and library versions this notebook was executed with.
modules = [requests, bs4, re, pd, np, sklearn, gensim]

print("Modules used for this project:")
major, minor, micro = sys.version_info[:3]
print(f"  - python v{major}.{minor}.{micro}")
for mod in modules:
    # Each listed module is expected to expose __name__ and __version__.
    print(f"  - {mod.__name__} v{mod.__version__}")

Modules used for this project:
  - python v3.9.2
  - requests v2.25.1
  - bs4 v4.9.3
  - re v2.2.1
  - pandas v1.2.3
  - numpy v1.19.2
  - sklearn v0.24.1
  - gensim v3.8.3


In [3]:
# Allow up to 300 characters per cell when frames are displayed, so long
# text columns (e.g. descriptions) are cut off less aggressively.
pd.set_option("display.max_colwidth", 300)

In [4]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and code edits are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [5]:
# Execute the helper notebook. Presumably this is where helpers used further
# down (e.g. head(), movie_search()) are defined — TODO confirm.
%run ./support_functions.ipynb

In [6]:
# Execute the algorithms notebook. Presumably this is where setup() and
# recommend_byTitle() used further down are defined — TODO confirm.
%run ./project_algos.ipynb

## Import data, generate models

In [8]:
# Load the raw movie dataset (one row per film; previewed in the
# "Check dataframes" section).
data = pd.read_csv("data_v4.csv")

In [9]:
# Keep an independent (deep) copy of the freshly loaded frame as a fallback.
backup_data = data.copy(deep=True)

In [12]:
# Elapsed seconds for each model build, appended in the order the builds run.
times = list()

Generate TF-IDF vectors and the cosine similarity matrix

In [13]:
# Build the TF-IDF frame and similarity matrix, timing the build.
# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# cannot be skewed by system clock adjustments the way time.time() can.
# NOTE: re-running this cell appends another entry to `times`; entries are
# read by position later, so run the timing cells once and in order.
start = time.perf_counter()
df1, cossim1, movie_index, duplicates = setup(data.copy(), tfidf=True)
stop = time.perf_counter()
times.append(stop - start)

Generate Doc2Vec vectors and cosine similarity matrices with feature vectors of 10, 25, 50, and 100 elements, respectively

In [15]:
# Build the Doc2Vec model with 10-element vectors, timing the build.
# perf_counter() is monotonic, so the interval is immune to system clock
# adjustments (unlike time.time()).
# NOTE: re-running this cell appends a duplicate entry to `times`.
start = time.perf_counter()
df2, cossim2, _, _ = setup(data.copy(), vec_size=10)
stop = time.perf_counter()
times.append(stop - start)

In [16]:
# Build the Doc2Vec model with 25-element vectors, timing the build.
# perf_counter() is monotonic, so the interval is immune to system clock
# adjustments (unlike time.time()).
# NOTE: re-running this cell appends a duplicate entry to `times`.
start = time.perf_counter()
df3, cossim3, _, _ = setup(data.copy(), vec_size=25)
stop = time.perf_counter()
times.append(stop - start)

In [17]:
# Build the Doc2Vec model with 50-element vectors, timing the build.
# perf_counter() is monotonic, so the interval is immune to system clock
# adjustments (unlike time.time()).
# NOTE: re-running this cell appends a duplicate entry to `times`.
start = time.perf_counter()
df4, cossim4, _, _ = setup(data.copy(), vec_size=50)
stop = time.perf_counter()
times.append(stop - start)

In [18]:
# Build the Doc2Vec model with 100-element vectors, timing the build.
# perf_counter() is monotonic, so the interval is immune to system clock
# adjustments (unlike time.time()).
# NOTE: re-running this cell appends a duplicate entry to `times`.
start = time.perf_counter()
df5, cossim5, _, _ = setup(data.copy(), vec_size=100)
stop = time.perf_counter()
times.append(stop - start)

In [19]:
# Persist the build timings to disk; np.save appends ".npy" to a filename that
# lacks it, so this writes "times.npy".
np.save("times", times)

In [20]:
# Display the recorded build times (seconds), in the order the models were built.
times

[7.428581237792969,
 198.28604388237,
 203.4817178249359,
 207.01334142684937,
 207.08093357086182]

In [21]:
# Grab the per-film vector column of the TF-IDF frame for the size report.
vecs = df1["vector"]

In [36]:
# Report the memory footprint and build time of each model.
# NOTE(review): the TF-IDF row sizes `cossim1` itself, whereas the Doc2Vec rows
# size only the `.index` attribute of each similarity object, so the two kinds
# of figures may not be directly comparable — confirm this is intended.
print(f"TFIDF vectors: {sys.getsizeof(vecs):21,} bytes")
print(f"TFIDF cossim1: {sys.getsizeof(cossim1):21,} bytes in {times[0]:7.3f} seconds")
print(f"D2V cossim2, 10 vec: {sys.getsizeof(cossim2.index):15,} bytes in {times[1]:0.3f} seconds")
print(f"D2V cossim3, 25 vec: {sys.getsizeof(cossim3.index):15,} bytes in {times[2]:0.3f} seconds")
print(f"D2V cossim4, 50 vec: {sys.getsizeof(cossim4.index):15,} bytes in {times[3]:0.3f} seconds")
# cossim5 is built with vec_size=100, so the label must say 100 (it said 50).
# The field width is one narrower to keep the byte counts right-aligned with
# the rows above despite the longer label.
print(f"D2V cossim5, 100 vec: {sys.getsizeof(cossim5.index):14,} bytes in {times[4]:0.3f} seconds")

TFIDF vectors:             4,189,576 bytes
TFIDF cossim1:        12,981,760,824 bytes in   7.429 seconds
D2V cossim2, 10 vec:       1,611,432 bytes in 198.286 seconds
D2V cossim3, 25 vec:       4,028,412 bytes in 203.482 seconds
D2V cossim4, 50 vec:       8,056,712 bytes in 207.013 seconds
D2V cossim5, 50 vec:      16,113,312 bytes in 207.081 seconds


## Check dataframes

In [37]:
# Preview the first two rows of the raw dataset (head() is a project helper,
# not defined in this notebook).
head(data,2)

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year
0,tt0066026,Mash,"Comedy, Drama, War",R,7.4,67665,116,Robert Altman,"Donald Sutherland, Elliott Gould, Tom Skerritt, Sally Kellerman",The staff of a Korean War field hospital use humor and high jinks to keep their sanity in the face of the horror of war.,1970
1,tt0065988,Little Big Man,"Adventure, Comedy, Drama",PG-13,7.6,32952,139,Arthur Penn,"Dustin Hoffman, Faye Dunaway, Chief Dan George, Martin Balsam","Jack Crabb, looking back from extreme old age, tells of his life being raised by Native Americans and fighting with General Custer.",1970


In [38]:
# Preview the TF-IDF frame; compared with the raw data it carries an extra
# `vector` column.
head(df1, 2)

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year,vector
0,tt0066026,Mash,"Comedy, Drama, War",R,7.4,67665,116,Robert Altman,"Donald Sutherland, Elliott Gould, Tom Skerritt, Sally Kellerman",The staff of a Korean War field hospital use humor and high jinks to keep their sanity in the face of the horror of war.,1970,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,tt0065988,Little Big Man,"Adventure, Comedy, Drama",PG-13,7.6,32952,139,Arthur Penn,"Dustin Hoffman, Faye Dunaway, Chief Dan George, Martin Balsam","Jack Crabb, looking back from extreme old age, tells of his life being raised by Native Americans and fighting with General Custer.",1970,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [39]:
# Preview the Doc2Vec frame built with vec_size=10.
head(df2, 2)

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year,vector
0,tt0066026,Mash,"Comedy, Drama, War",R,7.4,67665,116,Robert Altman,"Donald Sutherland, Elliott Gould, Tom Skerritt, Sally Kellerman",The staff of a Korean War field hospital use humor and high jinks to keep their sanity in the face of the horror of war.,1970,"[0.66858774, 0.5035114, -0.05596466, 0.20096123, -0.16959098, 0.63864076, 0.3058893, -0.91855097, 0.5144075, 0.36446914]"
1,tt0065988,Little Big Man,"Adventure, Comedy, Drama",PG-13,7.6,32952,139,Arthur Penn,"Dustin Hoffman, Faye Dunaway, Chief Dan George, Martin Balsam","Jack Crabb, looking back from extreme old age, tells of his life being raised by Native Americans and fighting with General Custer.",1970,"[0.9083052, 0.99575204, 0.34034598, 1.7585219, -0.79966307, 0.22148633, -0.68919164, -0.17331912, 0.3015805, -0.98604286]"


In [40]:
# Preview the Doc2Vec frame built with vec_size=25.
head(df3, 2)

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year,vector
0,tt0066026,Mash,"Comedy, Drama, War",R,7.4,67665,116,Robert Altman,"Donald Sutherland, Elliott Gould, Tom Skerritt, Sally Kellerman",The staff of a Korean War field hospital use humor and high jinks to keep their sanity in the face of the horror of war.,1970,"[0.48843127, 0.23576516, -0.4971744, 0.061640844, 1.0843799, 0.53327656, -0.07116237, -1.007941, 1.2093432, -0.6567683, -0.6324189, -1.1912214, 0.31969148, -0.2645797, 1.4001079, 0.40893745, -0.13410616, -0.7382487, 1.0042787, 1.4828411, 0.8467775, 0.5656017, 0.6357053, 0.1326554, 0.5853657]"
1,tt0065988,Little Big Man,"Adventure, Comedy, Drama",PG-13,7.6,32952,139,Arthur Penn,"Dustin Hoffman, Faye Dunaway, Chief Dan George, Martin Balsam","Jack Crabb, looking back from extreme old age, tells of his life being raised by Native Americans and fighting with General Custer.",1970,"[1.1699567, 0.0048573418, -1.1310339, 0.20571552, -0.4032766, 0.8644904, -0.32345444, -0.023487847, 0.17966157, -0.5405398, -1.5452095, -0.9210707, 1.7680256, 0.99038625, -1.2097617, 1.0375329, 0.28576848, -0.54948884, 1.207061, 2.2689402, 0.6778278, 0.7070185, -1.0979124, -0.6253758, -0.2613667]"


In [41]:
# Preview the Doc2Vec frame built with vec_size=50.
head(df4, 2)

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year,vector
0,tt0066026,Mash,"Comedy, Drama, War",R,7.4,67665,116,Robert Altman,"Donald Sutherland, Elliott Gould, Tom Skerritt, Sally Kellerman",The staff of a Korean War field hospital use humor and high jinks to keep their sanity in the face of the horror of war.,1970,"[0.55529165, -0.030563917, -0.15469606, -0.35724148, 0.24344909, 0.5174363, -0.47875154, -0.8683204, -0.00836391, -1.5436417, -0.92389226, -0.8801702, 0.8756326, 1.644272, 0.5479133, 0.46895942, 0.36023208, -0.5361242, 0.9278787, 1.455791, -0.03661401, 0.011641419, 0.4441431, 0.74182993, 0.09546..."
1,tt0065988,Little Big Man,"Adventure, Comedy, Drama",PG-13,7.6,32952,139,Arthur Penn,"Dustin Hoffman, Faye Dunaway, Chief Dan George, Martin Balsam","Jack Crabb, looking back from extreme old age, tells of his life being raised by Native Americans and fighting with General Custer.",1970,"[1.3986665, 0.060273845, -1.1085699, 1.3135228, -0.93644357, 1.0214444, -0.579783, -0.59201527, -0.46989954, -1.5635799, 0.14075026, 0.512299, 0.0727316, 1.2425766, -0.8716747, -0.014566928, 0.8081368, 0.29999396, 0.6228659, 1.6400483, 0.12439249, -0.2153739, -0.69080305, 0.67501956, 0.48737076,..."


In [42]:
# Preview the Doc2Vec frame built with vec_size=100.
head(df5, 2)

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year,vector
0,tt0066026,Mash,"Comedy, Drama, War",R,7.4,67665,116,Robert Altman,"Donald Sutherland, Elliott Gould, Tom Skerritt, Sally Kellerman",The staff of a Korean War field hospital use humor and high jinks to keep their sanity in the face of the horror of war.,1970,"[0.25917852, -0.30149174, -0.26702914, -0.00038738328, -0.06330841, 0.59687144, -0.6743027, -0.38444927, -0.48552206, 0.01878523, -0.13206711, -0.92466956, 0.041813675, 0.6301619, 0.06961605, 0.3045095, 0.663126, -0.49521363, 0.2159518, 0.7861044, 0.72507685, 0.5866445, -0.14609729, -0.3541358, ..."
1,tt0065988,Little Big Man,"Adventure, Comedy, Drama",PG-13,7.6,32952,139,Arthur Penn,"Dustin Hoffman, Faye Dunaway, Chief Dan George, Martin Balsam","Jack Crabb, looking back from extreme old age, tells of his life being raised by Native Americans and fighting with General Custer.",1970,"[0.8722944, -0.276538, 0.1028088, 0.8010518, 0.57279265, -0.21269044, -0.8556541, -0.48250705, 0.46135575, -1.114653, 0.06269224, 0.3777543, 0.20354284, 0.1252248, 0.293417, -0.21226127, 0.6286513, -0.3662587, -0.07599297, 0.7956106, -0.07835287, 0.23447642, -1.1391407, -0.100536294, 0.33670774,..."


## Compare output

In [45]:
# Query title used for every recommender comparison in this section.
movie = "The Avengers"

In [46]:
# Look the query title up in the raw dataset (movie_search() is a project
# helper); the recorded output shows two different films share this title.
movie_search(movie, data)

Unnamed: 0,uids,titles,genres,ratings,scores,votes,lengths,directors,stars,descriptions,year
10638,tt0118661,The Avengers,"Action, Adventure, Sci-Fi",PG-13,3.8,41085,89,Jeremiah S. Chechik,"Ralph Fiennes, Uma Thurman, Sean Connery, Patrick Macnee",Two British Agents team up to stop Sir August de Wynter from destroying the world with a weather-changing machine.,1998
25246,tt0848228,The Avengers,"Action, Adventure, Sci-Fi",PG-13,8.0,1266025,143,Joss Whedon,"Robert Downey Jr., Chris Evans, Scarlett Johansson, Jeremy Renner",Earth's mightiest heroes must come together and learn to fight as a team if they are going to stop the mischievous Loki and his alien army from enslaving humanity.,2012


In [47]:
# Recommendations for the query title from the TF-IDF model (df1 / cossim1).
recommend_byTitle(movie, cossim1, df1, tfidf=True)

The Avengers is a duplicate.


Unnamed: 0,Rank,Film,Year,Genre,Rating,Runtime,Director,Also starring,Description,Score
0,1,Avengers: Age Of Ultron,2015,"Action, Adventure, Sci-Fi",PG-13,141,Joss Whedon,"Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth","When Tony Stark and Bruce Banner try to jump-start a dormant peacekeeping program called Ultron, things go horribly wrong and it's up to Earth's mightiest heroes to stop the villainous Ultron from enacting his terrible plan.",7.298481
1,2,Hellboy Ii: The Golden Army,2008,"Action, Adventure, Fantasy",PG-13,120,Guillermo del Toro,"Ron Perlman, Selma Blair, Doug Jones, John Alexander","A prince of the mythical world starts a rebellion against humanity in order to rule the Earth, and Hellboy his team must fight to stop him from locating the all-powerful Golden Army.",6.996735
2,3,Titan A.E.,2000,"Animation, Action, Adventure",PG,94,Don Bluth,"Matt Damon, Drew Barrymore, Bill Pullman, Jim Breuer",A young man learns that he has to find a hidden Earth ship before an enemy alien species does in order to secure the survival of humanity.,6.592638
3,4,Lost In Space,1998,"Action, Adventure, Family",PG-13,130,Stephen Hopkins,"Gary Oldman, William Hurt, Matt LeBlanc, Mimi Rogers",The Robinson family was going into space to fight for a chance for humanity. Now they are fighting to live long enough to find a way home.,5.213613
4,5,Skylines,2020,"Action, Adventure, Sci-Fi",R,113,Liam O'Donnell,"Lindsey Morgan, Jonathan Howard, Daniel Bernhardt, Rhona Mitra","When a virus threatens to turn the now earth-dwelling friendly alien hybrids against humans, Captain Rose Corley must lead a team of elite mercenaries on a mission to the alien world in order to save what's left of humanity.",5.044015


In [48]:
# Recommendations for the query title from the Doc2Vec model built with vec_size=10.
recommend_byTitle(movie, cossim2, df2)

The Avengers is a duplicate.


Unnamed: 0,Rank,Film,Year,Genre,Rating,Runtime,Director,Also starring,Description,Score
0,1,Heavenly Sword,2014,"Animation, Action, Adventure",Not Rated,85,Gun Ho Jang,"Anna Torv, Alfred Molina, Thomas Jane, Ashleigh Ball","A power-hungry ruler known as King Bohan seeks to obtain the long-protected ""Heavenly Sword"" in order to use its devastating power for evil. Why doesn't anyone ever use immensely powerful swords for good anymore?",5.178006
1,2,Quintet,1979,"Drama, Mystery, Sci-Fi",R,118,Robert Altman,"Paul Newman, Vittorio Gassman, Fernando Rey, Bibi Andersson","During a future ice age, dying humanity occupies its remaining time by playing a board game called ""Quintet."" For one small group, this obsession is not enough; they play the game with living pieces ... and only the winner survives.",5.062453
2,3,Bloodsport 2,1996,"Action, Sport",R,90,Alan Mehrez,"Daniel Bernhardt, James Hong, Pat Morita, Donald Gibb","A talented street fighter ends up in a torturous prison camp - where he meets Sun, who teaches him humility, self-respect, and the deadliest fighting discipline: The Iron Hand. He promises Sun upon his release, he will fight in The Kumite.",4.83879
3,4,Allan Quatermain And The Lost City Of Gold,1986,"Action, Adventure, Comedy",PG,99,Gary Nelson,"Richard Chamberlain, Sharon Stone, James Earl Jones, Henry Silva","Allan Quatermain once again teams up with Jesse Huston where the discovery of a mysterious old gold piece sends Quatermain looking for his long-lost brother, missing in the wilds of Africa after seeking a lost white race.",4.635519
4,5,Scavengers,2013,"Action, Sci-Fi, Thriller",Not Rated,94,Travis Zariwny,"Sean Patrick Flanery, Jeremy London, Louise Linton, Kelley Whilden","A team of space scavengers discovers superior alien technology that threatens the balance of the known universe. Hotly pursued by a rival crew of intergalactic mercenaries, the Revelator crew must fight through the deepest reaches of space to locate and protect the life-altering device.",3.96383


In [49]:
# Recommendations for the query title from the Doc2Vec model built with vec_size=25.
recommend_byTitle(movie, cossim3, df3)

The Avengers is a duplicate.


Unnamed: 0,Rank,Film,Year,Genre,Rating,Runtime,Director,Also starring,Description,Score
0,1,The Little Devil,1988,"Comedy, Fantasy, Romance",,101,Roberto Benigni,"Roberto Benigni, Walter Matthau, Stefania Sandrelli, Giacomo Piperno","During an exorcism, father Maurice meets a little devil named Giuditta, who refuses to return to hell and decides to discover the world.",6.773512
1,2,Oggy And The Cockroaches: The Movie,2013,"Animation, Adventure, Comedy",,80,Olivier Jean Marie,"Shailesh Pandey, Hugues Le Bars","Ever since the world was born, two forces have been locked in perpetual battle. Their struggle is so Manichean, so ferocious, so Herculean that it makes the clash between good and evil look...",6.066024
2,3,Swift,2019,"Animation, Adventure, Comedy",Not Rated,90,Andrea Block,"Kate Winslet, Willem Dafoe, Josh Keaton, Cassandra Steen",The little swift Manou grows up believing he's a seagull learning to fly he finds out he never will be. When he runs from home he meets birds of his own species and finds out who he really is.,5.70303
3,4,Paragraf 78 - Film Vtoroy,2007,"Action, Sci-Fi, Thriller",,90,Mikhail Khleborodov,"Yuriy Kutsenko, Vladimir Vdovichenkov, Anastasiya Slanevskaya, Anatoliy Belyy","In the near future the command of the special setting gets a new important task. On a secret rocket base, where a very tall order brings a command over, they will have face to face to run ...",5.40622
4,5,Paragraf 78,2007,"Action, Sci-Fi, Thriller",Not Rated,88,Mikhail Khleborodov,"Yuriy Kutsenko, Vladimir Vdovichenkov, Anastasiya Slanevskaya, Grigoriy Siyatvinda","In the near future the command of the special setting gets a new important task. On a secret rocket base, where a very tall order brings a command over, they will have face to face to run ...",5.261197


In [50]:
# Recommendations for the query title from the Doc2Vec model built with vec_size=50.
recommend_byTitle(movie, cossim4, df4)

The Avengers is a duplicate.


Unnamed: 0,Rank,Film,Year,Genre,Rating,Runtime,Director,Also starring,Description,Score
0,1,Fasulye,2000,"Comedy, Crime",,105,Bora Tekay,"Selim Erdogan, Elvin Besikcioglu, Bülent Kayabas, Burak Sergen","The duty of taking the tax return forms of the villagers to the big city is assigned to a naive young guy. The villagers trust him, and hand their tax return forms to him. Taking the forms ...",7.090351
1,2,In The Gray,2012,"Crime, Drama",Not Rated,122,Rob Holloway,"George Katt, Lee Arenberg, Chad Lindberg, Martin Klebba","Undercover narcotics officer Yancy is always faced with difficult decision-making when it comes to his work, but the lines become even more blurred when other factors come in to play. His ...",7.083613
2,3,Hellboy Ii: The Golden Army,2008,"Action, Adventure, Fantasy",PG-13,120,Guillermo del Toro,"Ron Perlman, Selma Blair, Doug Jones, John Alexander","A prince of the mythical world starts a rebellion against humanity in order to rule the Earth, and Hellboy his team must fight to stop him from locating the all-powerful Golden Army.",6.99932
3,4,Mutiny On The Buses,1972,Comedy,PG,89,Harry Booth,"Reg Varney, Doris Hare, Michael Robbins, Anna Karen","Bus driver Stan Butler agrees to marry Suzy, much to the anguish of Mum, her son-in-law, Arthur, and daughter Olive. How, they wonder, will they ever manage without Stan's money coming in? Then Arthur is sacked, and Stan agrees to delay the wedding. Meanwhile, he hits on an idea: Arthur should l...",6.322374
4,5,Kadin Isi Banka Soygunu,2014,"Comedy, Drama",,104,A. Taner Elhan,"Filiz Ahmet, Güven Murat Akpinar, Cihan Aksoy, Ferit Aktug","Having left a difficult time with her husband leaving, Gülay returns to her mother's house with her son. On the one hand she struggles with health problems while on the other hand trying to...",5.609774


In [51]:
# Recommendations for the query title from the Doc2Vec model built with vec_size=100.
recommend_byTitle(movie, cossim5, df5)

The Avengers is a duplicate.


Unnamed: 0,Rank,Film,Year,Genre,Rating,Runtime,Director,Also starring,Description,Score
0,1,Green Snake,1993,"Drama, Fantasy",Not Rated,99,Hark Tsui,"Maggie Cheung, Joey Wang, Wenzhuo Zhao, Hsing-Kuo Wu",Two snake spirits become human and learn about love and suffering. A monk discovers his own weaknesses and finds that morality is not as simple as he had thought.,7.032916
1,2,Hellboy Ii: The Golden Army,2008,"Action, Adventure, Fantasy",PG-13,120,Guillermo del Toro,"Ron Perlman, Selma Blair, Doug Jones, John Alexander","A prince of the mythical world starts a rebellion against humanity in order to rule the Earth, and Hellboy his team must fight to stop him from locating the all-powerful Golden Army.",6.998445
2,3,Moondance Alexander,2007,"Drama, Family",G,94,Michael Damian,"Kay Panabaker, Don Johnson, Lori Loughlin, James Best","Moondance (Kay Panabaker) is faced with her father's passing and her overprotective mother (Lori Loughlin). When she finds a lost horse and discovers his abilities, she convinces his owner (Don Johnson) to train them to compete.",6.379051
3,4,Try Seventeen,2002,"Comedy, Drama",R,93,Jeffrey Porter,"Elijah Wood, Franka Potente, Mandy Moore, Chris William Martin",The story of a young man who enters college only to discover that he can learn more about life and love from his neighbors in the apartment building where he lives.,6.086755
4,5,Swift,2019,"Animation, Adventure, Comedy",Not Rated,90,Andrea Block,"Kate Winslet, Willem Dafoe, Josh Keaton, Cassandra Steen",The little swift Manou grows up believing he's a seagull learning to fly he finds out he never will be. When he runs from home he meets birds of his own species and finds out who he really is.,6.061157


In [52]:
# Persist the TF-IDF frame and the 100-element Doc2Vec frame as CSV (no index).
# df5 is the frame built with vec_size=100; previously df2 (vec_size=10) was
# written under the "df100vec" name, so the file did not match its label.
df1.to_csv("dfTFIDF", index=False)
df5.to_csv("df100vec", index=False)