In [30]:
import pandas as pd
import numpy as np
import nltk

# Display settings: show every row and every column of a DataFrame,
# and allow up to 100 characters per cell value (instead of the default 50).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 100)

# Location of the raw movies CSV (title, genre, Wikipedia plot, IMDb plot).
url = 'https://raw.githubusercontent.com/samiriff/datacamp-projects/master/Find%20Movie%20Similarity%20from%20Plot%20Summaries/datasets/movies.csv'

# Read the CSV with the pure-Python parser engine, then preview the first rows.
movie = pd.read_csv(filepath_or_buffer=url, engine='python')
movie.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vito Corleone hears requests in his role as the Godfa...","In late summer 1945, guests are gathered for the wedding reception of Don Vito Corleone's daught..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of murdering his wife and her lover and sentenced to ...","In 1947, Andy Dufresne (Tim Robbins), a banker in Maine, is convicted of murdering his wife and ..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the Kraków Ghetto as World War II begins. Oskar Schin...","The relocation of Polish Jews from surrounding areas to Krakow begins in late 1939, shortly afte..."
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight Italian American, Jake LaMotta (Robert De Niro), ...","The film opens in 1964, where an older and fatter Jake LaMotta (Robert De Niro) practices his st..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate Rick Blaine is the proprietor of an upscale night...,"In the early years of World War II, December 1941, the Moroccan coastal city of Casablanca attra..."


In [31]:
# Build a single 'plot' column per movie: the Wikipedia plot, a newline,
# then the IMDb plot. Both source columns are cast to str before joining.
movie['plot'] = (
    movie['wiki_plot'].astype(str)
    + "\n"
    + movie['imdb_plot'].astype(str)
)

In [32]:
# Preview the first five rows, now including the combined 'plot' column.
movie.head(n=5)

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot,plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vito Corleone hears requests in his role as the Godfa...","In late summer 1945, guests are gathered for the wedding reception of Don Vito Corleone's daught...","On the day of his only daughter's wedding, Vito Corleone hears requests in his role as the Godfa..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of murdering his wife and her lover and sentenced to ...","In 1947, Andy Dufresne (Tim Robbins), a banker in Maine, is convicted of murdering his wife and ...","In 1947, banker Andy Dufresne is convicted of murdering his wife and her lover and sentenced to ..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the Kraków Ghetto as World War II begins. Oskar Schin...","The relocation of Polish Jews from surrounding areas to Krakow begins in late 1939, shortly afte...","In 1939, the Germans move Polish Jews into the Kraków Ghetto as World War II begins. Oskar Schin..."
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight Italian American, Jake LaMotta (Robert De Niro), ...","The film opens in 1964, where an older and fatter Jake LaMotta (Robert De Niro) practices his st...","In a brief scene in 1964, an aging, overweight Italian American, Jake LaMotta (Robert De Niro), ..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate Rick Blaine is the proprietor of an upscale night...,"In the early years of World War II, December 1941, the Moroccan coastal city of Casablanca attra...",It is early December 1941. American expatriate Rick Blaine is the proprietor of an upscale night...


## Tokenization & Stemming

In [33]:
# Regular expressions are needed below to drop tokens that contain no letters
import re

# Split a sample paragraph into sentences; the result is kept in sent_tokenized
sent_tokenized = list(nltk.sent_tokenize("""
                        Today (May 19, 2016) is his only daughter's wedding. 
                        Vito Corleone is the Godfather.
                        """))

# Split only the first sentence into word-level tokens
words_tokenized = list(nltk.word_tokenize(sent_tokenized[0]))

# Keep a token only if it contains at least one ASCII letter
# (this removes pure punctuation and pure numbers)
filtered = list(
    filter(lambda token: re.search('[a-zA-Z]', token), words_tokenized)
)

# Show the surviving tokens
filtered

['Today', 'May', 'is', 'his', 'only', 'daughter', "'s", 'wedding']

In [34]:
# SnowballStemmer provides the stemming step
from nltk.stem.snowball import SnowballStemmer

# Stemmer configured for English
stemmer = SnowballStemmer("english")

# Show the tokens before stemming ...
print("Without stemming: ", filtered)

# ... reduce every token in filtered to its stem ...
stemmed_words = list(map(stemmer.stem, filtered))

# ... and show the tokens after stemming for comparison
print("After stemming:   ", stemmed_words)

Without stemming:  ['Today', 'May', 'is', 'his', 'only', 'daughter', "'s", 'wedding']
After stemming:    ['today', 'may', 'is', 'his', 'onli', 'daughter', "'s", 'wed']


In [35]:
# Define a function that performs both tokenization and stemming
def tokenize_and_stem(text):
    """Tokenize ``text`` and return the list of stemmed tokens.

    The text is split into sentences, each sentence into word tokens.
    Tokens without at least one ASCII letter (e.g. punctuation, numbers)
    are discarded; every remaining token is stemmed with the module-level
    ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list
        Stemmed tokens, in their original order of appearance.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # Noise filter: skip tokens that contain no letter at all
            if not re.search('[a-zA-Z]', token):
                continue
            stems.append(stemmer.stem(token))
    return stems

# Quick check of the helper on a short sample sentence
print(tokenize_and_stem(text="today is my birthday!!!! it's so great be greatful to life"))

['today', 'is', 'my', 'birthday', 'it', "'s", 'so', 'great', 'be', 'great', 'to', 'life']


### Create TfidfVectorizer

In [36]:
# TfidfVectorizer turns raw documents into TF-IDF feature vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the vectorizer used for the plot summaries
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,  # custom tokenizer: tokenize, filter, stem
    stop_words='english',         # use the built-in English stop-word list
    ngram_range=(1, 3),           # unigrams, bigrams and trigrams
    min_df=0.2,                   # ignore terms found in fewer than 20% of documents
    max_df=0.8,                   # ignore terms found in more than 80% of documents
    max_features=200000,          # upper bound on the vocabulary size
    use_idf=True,                 # apply inverse-document-frequency weighting
)

In [38]:
# Learn the vocabulary from every movie's combined "plot" text and encode
# each plot as a TF-IDF vector (one row per movie)
tfidf_matrix = tfidf_vectorizer.fit_transform(movie['plot'].tolist())

# (number of movies, number of retained terms)
tfidf_matrix.shape


(100, 564)

In [39]:
## K-means-clustering

In [42]:
from sklearn.cluster import KMeans

# Partition the TF-IDF plot vectors into 5 clusters.
# random_state fixes the centroid initialisation so the labels are the same
# on every run; n_init is pinned to 10 (the historical default) so the result
# does not shift with the installed scikit-learn version.
km = KMeans(n_clusters=5, n_init=10, random_state=42)

km.fit(tfidf_matrix)

# One cluster label per movie, in the same row order as tfidf_matrix
clusters = km.labels_.tolist()

# Attach the labels to `movie`, the DataFrame loaded at the top of the
# notebook, so each title can be inspected next to its cluster.
movie['cluster'] = clusters

movie.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot,plot,cluster
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vito Corleone hears requests in his role as the Godfa...","In late summer 1945, guests are gathered for the wedding reception of Don Vito Corleone's daught...","On the day of his only daughter's wedding, Vito Corleone hears requests in his role as the Godfa...",4
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of murdering his wife and her lover and sentenced to ...","In 1947, Andy Dufresne (Tim Robbins), a banker in Maine, is convicted of murdering his wife and ...","In 1947, banker Andy Dufresne is convicted of murdering his wife and her lover and sentenced to ...",4
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the Kraków Ghetto as World War II begins. Oskar Schin...","The relocation of Polish Jews from surrounding areas to Krakow begins in late 1939, shortly afte...","In 1939, the Germans move Polish Jews into the Kraków Ghetto as World War II begins. Oskar Schin...",4
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight Italian American, Jake LaMotta (Robert De Niro), ...","The film opens in 1964, where an older and fatter Jake LaMotta (Robert De Niro) practices his st...","In a brief scene in 1964, an aging, overweight Italian American, Jake LaMotta (Robert De Niro), ...",1
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate Rick Blaine is the proprietor of an upscale night...,"In the early years of World War II, December 1941, the Moroccan coastal city of Casablanca attra...",It is early December 1941. American expatriate Rick Blaine is the proprietor of an upscale night...,1


In [43]:
# Number of movies per cluster, most populated cluster first.
# Counted from the `clusters` label list produced by the K-means cell, so this
# cell does not depend on an undefined DataFrame name.
pd.Series(clusters, name='cluster').value_counts()

1    47
3    22
4    18
2     7
0     6
Name: cluster, dtype: int64

### Cosine-similarity

In [45]:
# cosine_similarity computes the pairwise similarity between movie plots
from sklearn.metrics.pairwise import cosine_similarity

# Turn similarity into a distance: 1 minus the pairwise cosine similarity
# of the TF-IDF plot vectors
similarity_distance = np.subtract(1, cosine_similarity(tfidf_matrix))