In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ast import literal_eval
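# ast.literal_eval() is a function (not a module) that safely evaluates a string containing a Python literal (list, dict, ...) into the corresponding object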

#自然言語処理
import nltk
import string
%matplotlib inline
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tag import pos_tag
import string

# Load the two CSV files. Mind the numbering: df2 is read from the movies file
# and df1 from the credits file.
df2 = pd.read_csv('../data_set/tmdb_5000_movies.csv')
df1 = pd.read_csv('../data_set/tmdb_5000_credits.csv')

# Overwrite the credits column names positionally so the key column is 'id'.
# NOTE(review): this assumes the credits file has exactly four columns in this
# order — confirm against the CSV header.
df1.columns= ['id','title','cast','crew']
# Join the credits onto the movies by 'id'. A later cell renames 'title_x' back
# to 'title', i.e. both frames carry a 'title' column and pandas suffixes them.
df2 = df2.merge(df1,on = 'id')

# These columns are stored as strings of Python literals; parse each cell into
# the corresponding Python object.
# NOTE(review): presumably lists of dicts with "name"/"job" keys, as the helper
# functions below expect — confirm against the data.
features = ["cast","crew","keywords","genres"]
for feature in features:
    df2[feature]= df2[feature].apply(literal_eval)

def get_director(x):
    """Return the name of the first crew entry whose job is "Director".

    Args:
        x: Iterable of dicts, each providing "job" and "name" keys.

    Returns:
        The "name" of the first entry with job "Director", or ``np.nan`` when
        no entry matches.
    """
    # Lazy generator: entries after the first match are never inspected.
    director_names = (member["name"] for member in x if member["job"] == "Director")
    return next(director_names, np.nan)

def get_list(x):
    """Return at most the first three "name" values from a list of dicts.

    Args:
        x: Expected to be a list of dicts, each with a "name" key.

    Returns:
        A list of up to three names in their original order, or an empty list
        when ``x`` is not a list (e.g. a missing value).
    """
    if not isinstance(x, list):
        return []
    # Names are read from every entry first and truncated afterwards.
    return [entry["name"] for entry in x][:3]

# Add a 'director' column computed from each row's crew entries; rows without a
# "Director" entry get NaN (see get_director).
df2["director"] = df2["crew"].apply(get_director)

# Replace cast / keywords / genres with lists of at most three names each
# (see get_list). `features` is re-bound here without 'crew'.
features = ["cast","keywords","genres"]
for feature in features:
    df2[feature] = df2[feature].apply(get_list)

def clean_data(x):
    """Lower-case a value and remove every space character from it.

    Args:
        x: A list of strings, a single string, or anything else
           (e.g. NaN for a missing director).

    Returns:
        A list of normalised strings for list input, a normalised string for
        string input, and '' for any other type.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither list nor string: treat as missing.
    return ''

# Keep only the columns used downstream. The explicit .copy() makes `df` an
# independent frame, so the column assignments below do not write into a
# selection of df2 (avoids pandas' SettingWithCopyWarning).
df = df2.rename(columns={'title_x': 'title'})
df = df[['title', 'genres', 'director', 'overview']].copy()

## Tokenization
# Lower-cased overview text. Missing overviews are mapped to '' first;
# astype(str) alone would turn NaN into the literal text "nan", which would
# then flow into the token columns as if it were a real word.
df['_overview'] = df['overview'].fillna('').astype(str).str.lower()

# Delete ASCII punctuation. The translation table is loop-invariant, so it is
# built once rather than once per row.
# NOTE(review): string.punctuation includes the apostrophe and the hyphen, so
# by the time clean_text() is applied to this column its contraction rules can
# no longer match, and hyphenated words are fused ("war-weary" -> "warweary").
# Confirm whether that is intended.
_punctuation_table = str.maketrans('', '', string.punctuation)
df['text_string'] = [text.translate(_punctuation_table) for text in df['_overview']]

In [2]:
import re 
# Ordered (pattern, replacement) rules for expanding English contractions.
# Order matters: a specific rule must run before any generic rule that would
# consume part of it ("what's" and "'scuse" before "'s", "can't" before "n't").
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Normalise free text: lower-case, expand contractions, drop non-word chars.

    Steps, in order:
      1. Lower-case the input.
      2. Apply the contraction rules in ``_CONTRACTION_RULES``.
      3. Replace every non-word character (regex ``\\W``) with a space.
      4. Collapse whitespace runs to a single space and strip the ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string ('' for empty input).
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Raw strings: "\W" and "\s" are regex escapes, not Python string escapes.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [3]:
# Normalise every punctuation-stripped overview with clean_text().
df['clean_text'] = df['text_string'].map(clean_text)

In [4]:
def lemmatize_sentence(sentence):
    """Tokenize, POS-tag and lemmatize a sentence.

    Each token's tag from ``pos_tag`` is reduced to the POS code passed to the
    lemmatizer: tags starting with 'NN' -> 'n', tags starting with 'VB' -> 'v',
    everything else -> 'a'.

    Args:
        sentence: The text to process.

    Returns:
        A list with one lemma per token, in token order.
    """
    lemmatizer = WordNetLemmatizer()

    def to_wordnet_pos(tag):
        # NOTE(review): adverb tags ('RB*') also land in the 'a' bucket;
        # WordNet has a separate 'r' code — confirm this is intended.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    tagged_tokens = pos_tag(word_tokenize(sentence))
    return [lemmatizer.lemmatize(token, to_wordnet_pos(tag)) for token, tag in tagged_tokens]

In [27]:
# Inspect the row whose title is 'Avatar'.
# NOTE(review): this cell carries execution count 27 and its saved output
# already shows text_lemmatize / text_stopword / text_clean, which are only
# created by cells further down — it was run out of order. On a fresh
# top-to-bottom run those columns do not exist yet at this point.
df.query("title == 'Avatar'")

Unnamed: 0,title,genres,director,overview,_overview,text_string,clean_text,text_lemmatize,text_stopword,text_clean
0,Avatar,"[Action, Adventure, Fantasy]",James Cameron,"In the 22nd century, a paraplegic Marine is di...","in the 22nd century, a paraplegic marine is di...",in the 22nd century a paraplegic marine is dis...,in the 22nd century a paraplegic marine is dis...,"[in, the, 22nd, century, a, paraplegic, marine...","[22nd, century, paraplegic, marine, dispatch, ...",22nd century paraplegic marine dispatch moon p...


In [24]:
# Total number of cells (rows x columns) and the (rows, columns) shape of df.
# NOTE(review): run out of order (execution count 24); the saved output reports
# 10 columns, which is the state after the later cells have added theirs.
df.size,df.shape

(48030, (4803, 10))

In [5]:
# Lemmatize every cleaned overview; each cell becomes a list of lemmas.
df['text_lemmatize'] = df['clean_text'].map(lemmatize_sentence)

In [6]:
# Preview the first few lemma lists.
df['text_lemmatize'].head()

0    [in, the, 22nd, century, a, paraplegic, marine...
1    [captain, barbossa, long, believe, to, be, dea...
2    [a, cryptic, message, from, bond, s, past, sen...
3    [follow, the, death, of, district, attorney, h...
4    [john, carter, be, a, warweary, former, milita...
Name: text_lemmatize, dtype: object

In [7]:
from nltk.corpus import stopwords

# Load NLTK's English stop-word list.
# NOTE: this re-binds the name `stopwords` (imported just above from
# nltk.corpus) to a plain list of words; the list binding is kept as-is so any
# code relying on it continues to work.
stopwords = nltk.corpus.stopwords.words("english")
# Membership is tested once per token across every row; a frozenset makes each
# test O(1) instead of a linear scan of the list, with identical results.
_stopword_set = frozenset(stopwords)

# Drop stop words from every lemma list, preserving token order.
df['text_stopword'] = df['text_lemmatize'].apply(lambda x: [item for item in x if item not in _stopword_set])

In [8]:
# Preview the first few token lists after stop-word removal.
df['text_stopword'].head()

0    [22nd, century, paraplegic, marine, dispatch, ...
1    [captain, barbossa, long, believe, dead, come,...
2    [cryptic, message, bond, past, sends, trail, u...
3    [follow, death, district, attorney, harvey, de...
4    [john, carter, warweary, former, military, cap...
Name: text_stopword, dtype: object

## one-hot 表現化

In [9]:
# Disabled: one-hot encoding of the 'genres' lists via MultiLabelBinarizer.
# Everything below is commented out, so this cell currently does nothing.
# from sklearn.preprocessing import MultiLabelBinarizer

# multilabel_binarizer = MultiLabelBinarizer()
# multilabel_binarizer.fit_transform(df['genres'])

# y = multilabel_binarizer.transform(df['genres'])

# for idx, genre in enumerate(multilabel_binarizer.classes_):
#   df[genre] = y[:,idx]

In [10]:
def _join_long_tokens(tokens):
    """Join the tokens longer than two characters into one space-separated string."""
    return ' '.join([token for token in tokens if len(token) > 2])


# Final cleaned text: tokens of one or two characters are discarded.
df['text_clean'] = df['text_stopword'].apply(_join_long_tokens)
df['text_clean'].head()

0    22nd century paraplegic marine dispatch moon p...
1    captain barbossa long believe dead come back l...
2    cryptic message bond past sends trail uncover ...
3    follow death district attorney harvey dent bat...
4    john carter warweary former military captain i...
Name: text_clean, dtype: object

In [11]:
# Show all column names currently present on df.
df.columns

Index(['title', 'genres', 'director', 'overview', '_overview', 'text_string',
       'clean_text', 'text_lemmatize', 'text_stopword', 'text_clean'],
      dtype='object')

In [12]:
# Build the export frame without the intermediate text-processing columns.
_intermediate_columns = ['text_lemmatize', '_overview', 'text_string', 'clean_text']
_df = df.drop(columns=_intermediate_columns)

In [13]:
# Write the export frame to CSV in the working directory, without the row index.
_df.to_csv('data_set.csv', index=False)

In [14]:
# Preview the first rows of the exported frame.
_df.head()

Unnamed: 0,title,genres,director,overview,text_stopword,text_clean
0,Avatar,"[Action, Adventure, Fantasy]",James Cameron,"In the 22nd century, a paraplegic Marine is di...","[22nd, century, paraplegic, marine, dispatch, ...",22nd century paraplegic marine dispatch moon p...
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]",Gore Verbinski,"Captain Barbossa, long believed to be dead, ha...","[captain, barbossa, long, believe, dead, come,...",captain barbossa long believe dead come back l...
2,Spectre,"[Action, Adventure, Crime]",Sam Mendes,A cryptic message from Bond’s past sends him o...,"[cryptic, message, bond, past, sends, trail, u...",cryptic message bond past sends trail uncover ...
3,The Dark Knight Rises,"[Action, Crime, Drama]",Christopher Nolan,Following the death of District Attorney Harve...,"[follow, death, district, attorney, harvey, de...",follow death district attorney harvey dent bat...
4,John Carter,"[Action, Adventure, Science Fiction]",Andrew Stanton,"John Carter is a war-weary, former military ca...","[john, carter, warweary, former, military, cap...",john carter warweary former military captain i...
