# 1) Packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re, string

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import spacy
nlp = spacy.load('en_core_web_sm')

from sklearn.metrics.pairwise import cosine_similarity

bold = "\033[1m"

# 2) Load Data

In [2]:
episodes = pd.read_csv("../input/south-park-scripts-dataset/SouthPark_Episodes.csv")

episodes.head()

Unnamed: 0,Title,Air Date,Code,#,Description,Season,Episode
0,Cartman Gets an Anal Probe,"August 13, 1997",101,1,"While the boys are waiting for the school bus,...",1,1
1,Weight Gain 4000,"August 20, 1997",102,2,When Cartman's environmental essay wins a nati...,1,2
2,Volcano,"August 27, 1997",103,3,A weekend trip to experience the finer points ...,1,3
3,Big Gay Al's Big Gay Boat Ride,"September 3, 1997",104,4,"When Stan discovers his new dog Sparky is gay,...",1,4
4,An Elephant Makes Love to a Pig,"September 10, 1997",105,5,"Kyle's mom won't let him keep his new pet, an ...",1,5


In [3]:
lines = pd.read_csv("../input/south-park-scripts-dataset/SouthPark_Lines.csv")

lines = lines[lines["Line"].notnull()]
lines

Unnamed: 0,Title,Character,Line
0,Cartman Gets an Anal Probe,Scene Description,At the bus stop.
1,Cartman Gets an Anal Probe,The Boys,"School days, school days, teacher's golden ru..."
2,Cartman Gets an Anal Probe,Kyle Broflovski,"Ah, damn it! My little brother's trying to fol..."
3,Cartman Gets an Anal Probe,Ike Broflovski,Eat banana.
4,Cartman Gets an Anal Probe,Kyle,"Ike, you can't come to school with me. [Ike Ch..."
...,...,...,...
95315,South ParQ Vaccination Special,Cartman,[turns to face Clyde] Yeah?
95316,South ParQ Vaccination Special,Clyde,[standing with Jimmy and an unnamed Hispanic k...
95317,South ParQ Vaccination Special,Cartman,[he takes off his mask] Casa Bonita?! Hell yea...
95318,South ParQ Vaccination Special,Scene Description,Fourth grade classroom. Mr. Garrison opens the...


# 3) Joing Scripts and Characters

In [4]:
episodes_list = episodes.Title.tolist()

script = {}

for episode in episodes_list:
    
    temp = lines[lines["Title"] == episode]
    episode_script = " ".join(temp.Line)
    script[episode] = episode_script
    
full_script_df = pd.DataFrame(script.items(), columns = ["Title", "Script"])
full_script_df

Unnamed: 0,Title,Script
0,Cartman Gets an Anal Probe,"At the bus stop. School days, school days, tea..."
1,Weight Gain 4000,"Mr. Garrison's classroom. Hey Stan, did you se..."
2,Volcano,"In front of Cartman's house. Jimbo, Ned, and t..."
3,Big Gay Al's Big Gay Boat Ride,"Bus Stop. Hey, where's the school bus? We're g..."
4,An Elephant Makes Love to a Pig,"Bus Stop. Hey Stan, where'd you get that black..."
...,...,...
304,Turd Burglars,"Park County Community Center, day. Inside, a m..."
305,Basic Cable,"The episode begins with a new opener for ""The ..."
306,Christmas Snow,The season ends with a regular introduction. T...
307,The Pandemic Special,The episode begins with a bleak landscape. Eve...


In [5]:
characters = {}

for episode in episodes_list:
    temp = lines[lines["Title"] == episode]
    episode_characters = " ".join(temp.Character) 
    characters[episode] = episode_characters
    
full_characters_df = pd.DataFrame(characters.items(), columns = ["Title", "Characters"])
full_characters_df

Unnamed: 0,Title,Characters
0,Cartman Gets an Anal Probe,Scene Description The Boys Kyle Broflovski Ike...
1,Weight Gain 4000,Scene Description Kyle Stan Cartman Kyle Stan ...
2,Volcano,Scene Description Liane Cartman Kyle Stan Jimb...
3,Big Gay Al's Big Gay Boat Ride,Scene Description Kyle Scene Description Stan ...
4,An Elephant Makes Love to a Pig,Scene Description Cartman Stan Cartman Stan Ca...
...,...,...
304,Turd Burglars,Scene Description Sheila Mrs. Testaburger Shei...
305,Basic Cable,Scene Description Scott Scott's Mom Scott Scot...
306,Christmas Snow,Scene Description Mayor McDaniels The boys San...
307,The Pandemic Special,Scene Description Butters Stephen Butters Step...


# 4) Vectorizing Characters, Scripts and Descriptions

For similarity, we don't have to use all characters. Setting a minimum line limit helps us for speeding our calculations and getting better results.

In [6]:
cast = lines.Character.value_counts()[lines.Character.value_counts() > 50].index.tolist()

cv = CountVectorizer(lowercase = False)

characters = cv.fit_transform(full_characters_df["Characters"])
characters_cv = pd.DataFrame(characters.todense(), columns = cv.get_feature_names())

characters_cv.set_index(full_characters_df.Title, inplace = True)

characters_cv = characters_cv[[x for x in characters_cv.columns if x in cast]]

characters_cv

Unnamed: 0_level_0,Agent,All,Anchor,Announcer,Barbrady,Bebe,Bill,Bob,Boy,Boys,...,Terrance,Thomas,Timmy,Token,Tom,Towelie,Tweek,Wendy,Woman,Yates
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cartman Gets an Anal Probe,0,0,0,0,9,0,0,0,0,4,...,0,0,0,0,0,0,0,21,0,0
Weight Gain 4000,2,0,0,3,9,3,0,0,0,0,...,0,0,0,0,0,0,0,30,0,0
Volcano,0,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Big Gay Al's Big Gay Boat Ride,0,0,0,2,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
An Elephant Makes Love to a Pig,0,0,0,0,2,1,4,0,0,1,...,16,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Turd Burglars,0,1,0,2,0,0,0,0,0,0,...,0,0,0,0,10,0,0,0,0,5
Basic Cable,0,0,0,4,0,0,1,1,1,1,...,0,0,2,0,0,0,0,0,1,0
Christmas Snow,0,2,0,3,0,0,0,3,0,0,...,0,2,0,0,0,7,1,0,1,0
The Pandemic Special,0,0,7,0,0,0,4,0,0,0,...,0,4,1,1,7,0,1,1,1,15


In [7]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "can not",
"can't've": "can not have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have",
"wanna": "want to",
"gonna": "going to",
"gotta": "have got to"
}

In [8]:
all_stopwords = nlp.Defaults.stop_words

def tokenizer(text):
    
    text = text.replace("in'", "ing")
    text = text.replace("m'kay", "mkay")
    tokens = text.split()
    tokens = [re.sub(token, contractions[token], token) if token in contractions.keys() else token for token in tokens]
    tokens = [token.strip(string.punctuation) for token in tokens]    
    tokens = [token.lower() for token in tokens]    
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if len(token) > 1]
    tokens = [token for token in tokens if token not in all_stopwords]

#     n = lambda pos: pos[:2].startswith("N")
#     tokens = [word for (word, pos) in nltk.pos_tag(tokens) if n(pos)] 
    
    lemmatizer = nltk.wordnet.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token, "v")  if token != "butters" else "butters" for token in tokens]
    lemmas = [lemmatizer.lemmatize(token)  if token != "butters" else "butters" for token in lemmas]
    
    return lemmas

In [9]:
tfidf_script = TfidfVectorizer(min_df = 3, max_df = 0.8, tokenizer = tokenizer, ngram_range = (1, 3), max_features = 5000,
#                         binary = True, use_idf = False, norm = None
                       )

tfidf_script_matrix = tfidf_script.fit_transform(full_script_df["Script"])
tfidf_script_df = pd.DataFrame(tfidf_script_matrix.todense(), columns = tfidf_script.get_feature_names())

tfidf_script_df.set_index(full_script_df.Title, inplace = True)

tfidf_script_df

Unnamed: 0_level_0,aaaa,aaaaa,aaaaah,aaaah,aaah,aaand,aah,aback,abandon,ability,...,younger,yuh,yum,yummy,yummy yummy,yup,zero,zip,zombie,zoom
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cartman Gets an Anal Probe,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.007332
Weight Gain 4000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
Volcano,0.0,0.0,0.047733,0.000000,0.000000,0.000000,0.013608,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
Big Gay Al's Big Gay Boat Ride,0.0,0.0,0.044396,0.000000,0.013861,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
An Elephant Makes Love to a Pig,0.0,0.0,0.010298,0.008345,0.000000,0.000000,0.008807,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.010103,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Turd Burglars,0.0,0.0,0.016591,0.000000,0.012949,0.000000,0.000000,0.0,0.000000,0.0,...,0.017877,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.010707
Basic Cable,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
Christmas Snow,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
The Pandemic Special,0.0,0.0,0.000000,0.007068,0.000000,0.009087,0.000000,0.0,0.009087,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.010015,0.000000,0.000000,0.045034


In [10]:
tfidf_description = TfidfVectorizer(min_df = 2, max_df = 0.8, tokenizer = tokenizer, ngram_range = (1, 3), max_features = 5000,
#                         binary = True, use_idf = False, norm = None
                       )

tfidf_description_matrix = tfidf_description.fit_transform(episodes["Description"])
tfidf_description_df = pd.DataFrame(tfidf_description_matrix.todense(), columns = tfidf_description.get_feature_names())

tfidf_description_df.set_index(episodes.Title, inplace = True)

tfidf_description_df

Unnamed: 0_level_0,access,accident,accidentally,activity,addict,adult,adventure,advice,afraid,agree,...,wok,woman,word,work,world,worry,wrestle,write,wrong,year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cartman Gets an Anal Probe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
Weight Gain 4000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
Volcano,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
Big Gay Al's Big Gay Boat Ride,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
An Elephant Makes Love to a Pig,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Turd Burglars,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
Basic Cable,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.244355,0.0,0.0,0.0,0.0,0.0,0.000000
Christmas Snow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.239452
The Pandemic Special,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000


If we don't scale, characters_cv frame will be more important. Other features, scripts and descriptions will not affect so much.

In [11]:
scaler = MinMaxScaler()

for col in characters_cv.columns:
    characters_cv[col] = scaler.fit_transform(characters_cv[col].values.reshape(-1, 1))

In [12]:
train = pd.concat([characters_cv, tfidf_script_df, tfidf_description_df], axis = 1)
train

Unnamed: 0_level_0,Agent,All,Anchor,Announcer,Barbrady,Bebe,Bill,Bob,Boy,Boys,...,wok,woman,word,work,world,worry,wrestle,write,wrong,year
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cartman Gets an Anal Probe,0.000000,0.000000,0.000,0.000000,0.109756,0.000000,0.000000,0.000000,0.000000,0.444444,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
Weight Gain 4000,0.058824,0.000000,0.000,0.176471,0.109756,0.057692,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
Volcano,0.000000,0.000000,0.000,0.000000,0.048780,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
Big Gay Al's Big Gay Boat Ride,0.000000,0.000000,0.000,0.117647,0.000000,0.000000,0.107143,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
An Elephant Makes Love to a Pig,0.000000,0.000000,0.000,0.000000,0.024390,0.019231,0.142857,0.000000,0.000000,0.111111,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Turd Burglars,0.000000,0.142857,0.000,0.117647,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
Basic Cable,0.000000,0.000000,0.000,0.235294,0.000000,0.000000,0.035714,0.034483,0.052632,0.111111,...,0.0,0.0,0.0,0.244355,0.0,0.0,0.0,0.0,0.0,0.000000
Christmas Snow,0.000000,0.285714,0.000,0.176471,0.000000,0.000000,0.000000,0.103448,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.239452
The Pandemic Special,0.000000,0.000000,0.875,0.000000,0.000000,0.000000,0.142857,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000


# 5) Recommend Episodes, or Find Similar Episodes

In [13]:
cosine_sim = cosine_similarity(train) #Recommendations for using characters, lines, and description of episodes
cosine_sim_chars = cosine_similarity(characters_cv) #Recommendations for using characters number of lines of episodes
cosine_sim_lines = cosine_similarity(tfidf_script_df) #Recommendations for using script of that episode
cosine_sim_description = cosine_similarity(tfidf_description_df) #Recommendations for using description of that episode

indices = pd.Series(range(0, len(train.index)), index = train.index).drop_duplicates()

In [14]:
def get_recommendations(title, cosine_sim = cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    recommendations = pd.DataFrame({"Episodes": train.iloc[movie_indices].index.tolist(),
                                    "Similarity": [sim[1] for sim in sim_scores]})
    return recommendations

**Recommendations for just using characters that have lines:**

In [15]:
get_recommendations("Mr. Hankey, the Christmas Poo", cosine_sim_chars).head(5)

Unnamed: 0,Episodes,Similarity
0,Chef's Chocolate Salty Balls,0.761548
1,The Entity,0.730565
2,Butt Out,0.724521
3,Cartmanland,0.672767
4,Cartoon Wars Part I,0.654744


**Chef's Chocolate Salty Balls** could be a good recommendation, but others... They would better.

**Recommendations for using script of that episode:**

In [16]:
get_recommendations("Mr. Hankey, the Christmas Poo", cosine_sim_lines).head(5)

Unnamed: 0,Episodes,Similarity
0,A Very Crappy Christmas,0.750676
1,Chef's Chocolate Salty Balls,0.651348
2,The Problem with a Poo,0.648276
3,Merry Christmas Charlie Manson!,0.540968
4,Mr. Hankey's Christmas Classics,0.499283


Nice, all recommendations are good for Mr. Hankey lovers.

**Recommendations for using description of that episode:**

In [17]:
get_recommendations("Mr. Hankey, the Christmas Poo", cosine_sim_description).head(5)

Unnamed: 0,Episodes,Similarity
0,Sarcastaball,0.678944
1,Stunning and Brave,0.579862
2,Lice Capades,0.426144
3,1%,0.425262
4,The F Word,0.425208


It looks like we are fail. We get irrelevant recommendations.

**Recommendations for using characters, lines, and description of episode:**

In [18]:
get_recommendations("Mr. Hankey, the Christmas Poo").head(5)

Unnamed: 0,Episodes,Similarity
0,Chef's Chocolate Salty Balls,0.489343
1,A Very Crappy Christmas,0.429401
2,Mr. Hankey's Christmas Classics,0.397528
3,The Entity,0.350728
4,Stunning and Brave,0.310066


Concatenating all features gives good result. However recommendations with using lines could be better.

In [19]:
print(bold + "With using all features: \n")
display(get_recommendations("Terrance and Phillip: Behind the Blow").head(5))

print(bold + "With using just scripts: \n")
display(get_recommendations("Terrance and Phillip: Behind the Blow", cosine_sim_lines).head(5))

[1mWith using all features: 



Unnamed: 0,Episodes,Similarity
0,Terrance and Phillip in Not Without My Anus,0.550914
1,Freemium Isn't Free,0.456144
2,Death,0.433704
3,Cow Days,0.410009
4,"Eat, Pray, Queef",0.317939


[1mWith using just scripts: 



Unnamed: 0,Episodes,Similarity
0,Terrance and Phillip in Not Without My Anus,0.775028
1,SUPER HARD PCness,0.550337
2,Death,0.54291
3,"Eat, Pray, Queef",0.445814
4,Freemium Isn't Free,0.414382


In [20]:
print(bold + "With using all features: \n")
display(get_recommendations("Tweek x Craig").head(5))

print(bold + "With using just scripts: \n")
display(get_recommendations("Tweek x Craig", cosine_sim_lines).head(5))

[1mWith using all features: 



Unnamed: 0,Episodes,Similarity
0,Put It Down,0.55533
1,Tweek vs. Craig,0.552689
2,Child Abduction is Not Funny,0.349223
3,Gnomes,0.27897
4,Free Hat,0.250526


[1mWith using just scripts: 



Unnamed: 0,Episodes,Similarity
0,Tweek vs. Craig,0.642528
1,Put It Down,0.447581
2,Child Abduction is Not Funny,0.363301
3,Gnomes,0.197474
4,Free Hat,0.191913


In [21]:
print(bold + "With using all features: \n")
display(get_recommendations("Skank Hunt").head(5))

print(bold + "With using just scripts: \n")
display(get_recommendations("Skank Hunt", cosine_sim_lines).head(5))

[1mWith using all features: 



Unnamed: 0,Episodes,Similarity
0,Wieners Out,0.314394
1,The Damned,0.298339
2,W.T.F.,0.235703
3,Members Only,0.232773
4,Douche and a Danish,0.230284


[1mWith using just scripts: 



Unnamed: 0,Episodes,Similarity
0,The Damned,0.589578
1,Wieners Out,0.521751
2,Douche and a Danish,0.44429
3,Fort Collins,0.425838
4,Members Only,0.377173


In [22]:
get_recommendations("Butters' Very Own Episode")

Unnamed: 0,Episodes,Similarity
0,The Death of Eric Cartman,0.376835
1,AWESOM-O,0.341609
2,City Sushi,0.309612
3,Raisins,0.283415
4,Professor Chaos,0.276821
5,Grounded Vindaloop,0.272438
6,Marjorine,0.27235
7,Butters' Bottom Bitch,0.264894
8,Cartman Sucks,0.261537
9,Jared Has Aides,0.25533


Generally Butters is main character of that episodes.