## Reccomendation system will be content and popularity based

### Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

### Data Collection

In [2]:
df = pd.read_csv("movies.csv")
df.head(1)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron


In [3]:
df.shape

(4803, 24)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [5]:
df.describe()

Unnamed: 0,index,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,2401.0,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,1386.651002,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,1200.5,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,2401.0,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,3601.5,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,4802.0,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


#### Feature Selection

In [6]:
 features = ['keywords','genres','tagline','cast','director']
 features

['keywords', 'genres', 'tagline', 'cast', 'director']

#### Replacing the nan values from selected features

In [7]:
for feature in features:
    df[feature] = df[feature].fillna('')

#### Combibning all selected features

In [8]:
comined_features = df['keywords']+''+df['genres']+''+df['tagline']+''+df['cast']+''+df['director']
comined_features

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6Acti...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799    Comedy RomanceA newlywed couple's honeymoon is...
4800    date love at first sight narration investigati...
4801    A New Yorker in ShanghaiDaniel Henney Eliza Co...
4802    obsession camcorder crush dream girlDocumentar...
Length: 4803, dtype: object

so from above result it shows that the selected data or features has 'string' type or text data which cannot be used for further process so we have to convert it into 'int' or numerical data ,look at the below cell for the conversion process.

In [9]:
vectorizor = TfidfVectorizer()
numeric_data = vectorizor.fit_transform(comined_features)

numeric_data

<4803x26295 sparse matrix of type '<class 'numpy.float64'>'
	with 111686 stored elements in Compressed Sparse Row format>

In [10]:
print(numeric_data)

  (0, 3466)	0.16611619087712773
  (0, 19688)	0.26851914409699834
  (0, 15657)	0.1539008504139717
  (0, 13615)	0.2193203637846457
  (0, 22133)	0.158137279883646
  (0, 25274)	0.19255822916196522
  (0, 21344)	0.19813160241831268
  (0, 20374)	0.21130017132513906
  (0, 26258)	0.19385923065061664
  (0, 25897)	0.2302235319492523
  (0, 20409)	0.1531899132330251
  (0, 17535)	0.2561245132609643
  (0, 17136)	0.08521407424479618
  (0, 25877)	0.1220435815997533
  (0, 23039)	0.0737197291180072
  (0, 8575)	0.26851914409699834
  (0, 20717)	0.10357237403043922
  (0, 8230)	0.12851799881041945
  (0, 401)	0.10528296969003903
  (0, 21653)	0.26851914409699834
  (0, 4547)	0.24050912515097875
  (0, 25068)	0.12867516584529845
  (0, 21788)	0.32515067453066626
  (0, 9383)	0.1602758384482496
  (0, 4328)	0.21374699052829824
  :	:
  (4801, 5123)	0.315908227869696
  (4801, 21052)	0.315908227869696
  (4801, 26077)	0.315908227869696
  (4801, 26231)	0.30132615449210415
  (4801, 7457)	0.2618158620242036
  (4801, 650)	0.

#### For Similarity among features so that newly recommendation will be matchable

In [11]:
similarity_data = cosine_similarity(numeric_data)

similarity_data

array([[1.        , 0.06507699, 0.01167579, ..., 0.        , 0.        ,
        0.        ],
       [0.06507699, 1.        , 0.02333335, ..., 0.01168758, 0.        ,
        0.        ],
       [0.01167579, 0.02333335, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.01168758, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [12]:
similarity_data.shape

(4803, 4803)

Now , we are going to take an input as a user that give name of any movie ,so that beside exact movie other related movies will also be shown

In [32]:
name = input('Enter any movie name  :')

In [26]:
list_of_all_movies = df['title'].tolist()
list_of_all_movies

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',

now recommended will be

In [33]:
recomended = difflib.get_close_matches(name,list_of_all_movies)
recomended

['Iron Man', 'Iron Man 3', 'Iron Man 2']

mention the closest match

In [34]:
 closest_match = recomended[0]
 closest_match

'Iron Man'

for suggesting or recommending the name or title of the movie we first access the indexes of all movies according to title and also find the simillarity score of each one

so first access the index of the closest match

In [36]:
indexes = df[df.title == closest_match]['index'].values[0]
indexes

68

now we will get the list of all movies sith similarity scrore

In [39]:
similarity_score = list(enumerate(similarity_data[indexes]))
similarity_score


[(0, 0.010270217723167297),
 (1, 0.03388715928584817),
 (2, 0.0),
 (3, 0.0),
 (4, 0.010141012220039722),
 (5, 0.0),
 (6, 0.05126862760478511),
 (7, 0.17471646354948867),
 (8, 0.0),
 (9, 0.029466094568726477),
 (10, 0.011749626026285894),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.039678985588738454),
 (15, 0.0),
 (16, 0.16986405227452145),
 (17, 0.0),
 (18, 0.009889189136747046),
 (19, 0.06416055344793857),
 (20, 0.06170763248466677),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.15173640954840845),
 (27, 0.009517513075016588),
 (28, 0.05210030199396191),
 (29, 0.0),
 (30, 0.06234828713115115),
 (31, 0.2110893110342683),
 (32, 0.020010920926046356),
 (33, 0.11723335382279096),
 (34, 0.0),
 (35, 0.015302948072133345),
 (36, 0.014389566111913776),
 (37, 0.0),
 (38, 0.09317494131383655),
 (39, 0.06996885696471665),
 (40, 0.0),
 (41, 0.009867499046892294),
 (42, 0.0),
 (43, 0.025849073813591777),
 (44, 0.0),
 (45, 0.02327256865486361),
 (46, 0.11552351455455223),
 (4

check the length

In [40]:
len(similarity_score)

4803

now we must sort the data so that more similar will be on top for recommendation

In [45]:
sorted_most_scorer = sorted(similarity_score, key=lambda x:x[1] , reverse=True)
sorted_most_scorer

[(68, 1.0000000000000002),
 (79, 0.2813695275403554),
 (31, 0.2110893110342683),
 (7, 0.17471646354948867),
 (16, 0.16986405227452145),
 (85, 0.15720854613421684),
 (182, 0.15516047320918902),
 (101, 0.15318410060540139),
 (26, 0.15173640954840845),
 (33, 0.11723335382279096),
 (46, 0.11552351455455223),
 (4401, 0.11440605778166893),
 (203, 0.10836790937851089),
 (353, 0.10735737276801674),
 (511, 0.1064592479167957),
 (3623, 0.10602107093035937),
 (954, 0.10032534971768903),
 (2235, 0.10008489624311405),
 (1210, 0.09889313849830489),
 (174, 0.09446733371994838),
 (38, 0.09317494131383655),
 (64, 0.09229429155799371),
 (3166, 0.0922728815854815),
 (1406, 0.09131473457067929),
 (2186, 0.08892768220221951),
 (882, 0.08862925069760247),
 (126, 0.08705862386400914),
 (788, 0.08608100366605693),
 (3443, 0.08439401887910367),
 (2063, 0.08408500486984247),
 (94, 0.08296354219970242),
 (940, 0.08201103745047902),
 (1956, 0.07976820037117358),
 (2411, 0.07946189183346977),
 (3385, 0.07943419237

as we are access all the movies that has major to minor similartiy in the form of indexes and scores in numeric ,now we should have to recommend the name in this manner,so we access the title

In [64]:
print("Top 20 Movies recommended for you the most similar to minute similar on your search or search history \n")

i=1
for movie in sorted_most_scorer:
    index_value = movie[0]
    title_from_indexes = df[df.index==index_value]['title'].values[0]
    
    if(i<21):
        print(i,'.',title_from_indexes)
        i+=1

    

Top 20 Movies recommended for you the most similar to minute similar on your search or search history 

1 . Iron Man
2 . Iron Man 2
3 . Iron Man 3
4 . Avengers: Age of Ultron
5 . The Avengers
6 . Captain America: The Winter Soldier
7 . Ant-Man
8 . X-Men: First Class
9 . Captain America: Civil War
10 . X-Men: The Last Stand
11 . X-Men: Days of Future Past
12 . The Helix... Loaded
13 . X2
14 . Tropic Thunder
15 . X-Men
16 . Made
17 . The Judge
18 . A Scanner Darkly
19 . Gothika
20 . The Incredible Hulk


### Set a Recommendation System 

In [74]:
name = input('Enter any movie name  :')
list_of_all_movies = df['title'].tolist()
recomended = difflib.get_close_matches(name,list_of_all_movies)
closest_match = recomended[0]
indexes = df[df.title == closest_match]['index'].values[0]
similarity_score = list(enumerate(similarity_data[indexes]))
sorted_most_scorer = sorted(similarity_score, key=lambda x:x[1] , reverse=True)

print("Top 25 Movies recommended for you the most similar to minute similar on your search or search history \n")

i=1
for movie in sorted_most_scorer:
    index_value = movie[0]
    title_from_indexes = df[df.index==index_value]['title'].values[0]
    
    if(i<26):
        print(i,'.',title_from_indexes)
        i+=1


Top 25 Movies recommended for you the most similar to minute similar on your search or search history 

1 . Avatar
2 . Alien
3 . Guardians of the Galaxy
4 . Galaxy Quest
5 . Cargo
6 . Trekkies
7 . Moonraker
8 . Space Dogs
9 . Event Horizon
10 . Pocahontas
11 . Clash of the Titans
12 . The Right Stuff
13 . Aliens
14 . Alien³
15 . Star Trek Beyond
16 . The Astronaut's Wife
17 . Star Wars: Clone Wars: Volume 1
18 . The Book of Life
19 . Star Trek Into Darkness
20 . Terminator Salvation
21 . Imaginary Heroes
22 . Space Chimps
23 . A LEGO Brickumentary
24 . Planet of the Apes
25 . Out of the Furnace
