In [None]:
import pandas as pd

# Directory that holds the input CSV files.
# The original path expression '%s/%s' % ('mvp.csv') supplied a single value
# for two placeholders and raised "TypeError: not enough arguments for format
# string"; the directory component is now passed explicitly.
# NOTE(review): '.' (the notebook's working directory) is an assumption -
# point this at the real data folder.
DATA_DIR = '.'

# MVP table: only the listed columns are read, with explicit dtypes
mvp = pd.read_csv('%s/%s' % (DATA_DIR, 'mvp.csv'),
                  usecols=['id', 'name', 'team', 'league', 'year'],
                  dtype={'id': int, 'name': str, 'team': str, 'league': str, 'year': int})

# Labeled articles: only the listed columns are read, with explicit dtypes
article_labeled = pd.read_csv('%s/%s' % (DATA_DIR, 'article_labeled.csv'),
                  usecols=['id', 'year', 'text', 'name'],
                  dtype={'id': int, 'year': int, 'text': str, 'name': str})

# Stats: Number of Articles per Year
# (last expression of the cell, so the resulting Series is displayed)
article_labeled.groupby(['year']).size()

In [None]:
###################################
# 1-4. Word Processing
###################################
import nltk
import gensim

# Fetch the WordNet data used by the lemmatizer
nltk.download('wordnet')

# gensim's built-in stop words extended with corpus-specific terms
stop_words = gensim.parsing.preprocessing.STOPWORDS | frozenset(
    ('mlb', 'major', 'league', 'baseball', 'game', 'team')
)

# Forwarded to gensim.models.Phrases as `min_count` and `threshold`
MIN_COUNT = 1
# Per the original note: a smaller threshold gives longer phrases
# (e.g. Los Angeles Angels)
THRESHOLD = 1
    
# Create BagOfWords
def create_bow(values):
    """Turn raw text strings into lists of normalized tokens.

    Every string is split on whitespace, tokens found in the module-level
    ``stop_words`` are dropped, and each remaining token is lemmatized with
    nltk's WordNetLemmatizer and then stemmed with the Snowball English
    stemmer.

    Args:
        values: iterable of text strings.

    Returns:
        A list holding one list of stemmed tokens per input string.
    """
    wordnet = nltk.stem.WordNetLemmatizer()
    snowball_en = nltk.stem.snowball.EnglishStemmer()

    def normalize(token):
        # Lemmatize first, then stem the resulting lemma
        return snowball_en.stem(wordnet.lemmatize(token))

    bow = []
    for text in values:
        # Tokenize on whitespace and filter stop words before normalizing
        kept = (tok for tok in text.split() if tok not in stop_words)
        bow.append([normalize(tok) for tok in kept])
    return bow

###################################
# 1-5. Bigram & Trigram Models
###################################
# Create Bigram Model
def create_bigram_model(docs):
    """Fit a gensim Phrases model on tokenized documents.

    Args:
        docs: tokenized documents handed to ``gensim.models.Phrases``.

    Returns:
        Tuple ``(bigram_model, bigram)``: the Phraser built from the fitted
        Phrases model, followed by the Phrases model itself.
    """
    phrases = gensim.models.Phrases(docs, min_count=MIN_COUNT, threshold=THRESHOLD)
    return gensim.models.phrases.Phraser(phrases), phrases

# Create Trigram Model
def create_trigram_model(docs, bigram):
    """Fit a second Phrases model on top of bigram-transformed documents.

    Args:
        docs: tokenized documents.
        bigram: the bigram model; it is indexed with ``docs`` to produce the
            training input for the trigram model.

    Returns:
        The Phraser built from the fitted trigram Phrases model.
    """
    bigram_docs = bigram[docs]
    phrases = gensim.models.Phrases(bigram_docs, min_count=MIN_COUNT, threshold=THRESHOLD)
    return gensim.models.phrases.Phraser(phrases)

# Create Bigram Trigram Models
def create_bigram_trigram_models(values):
    """Build the bigram and trigram models from raw text strings.

    Args:
        values: iterable of text strings, passed to ``create_bow``.

    Returns:
        Tuple ``(bigram_model, trigram_model)``.
    """
    tokenized = create_bow(values)
    bigram_model, bigram_phrases = create_bigram_model(tokenized)
    # The trigram model is trained with the unfrozen bigram Phrases object
    return bigram_model, create_trigram_model(tokenized, bigram_phrases)

###################################
# 1-6. Bigram & Trigram Terms
###################################
# Create Bigram Terms
def create_bigram_terms(bigram_model, docs):
    """Apply the bigram model to each document.

    Args:
        bigram_model: object indexed with a document (``bigram_model[doc]``).
        docs: iterable of documents.

    Returns:
        A list with ``bigram_model[doc]`` for every document, in input order.
    """
    phrased = []
    for tokens in docs:
        phrased.append(bigram_model[tokens])
    return phrased

# Create Trigram Terms
def create_trigram_terms(bigram_model, trigram_model, docs):
    """Apply the bigram model, then the trigram model, to each document.

    Args:
        bigram_model: model passed to ``create_bigram_terms``.
        trigram_model: object indexed with each bigram-transformed document.
        docs: iterable of documents.

    Returns:
        A list with one trigram-transformed document per input document.
    """
    after_bigram = create_bigram_terms(bigram_model, docs)
    after_trigram = []
    for tokens in after_bigram:
        after_trigram.append(trigram_model[tokens])
    return after_trigram

# Create Trigram
def create_Trigram(values, bigram_model, trigram_model):
    """Tokenize raw text strings and run them through both phrase models.

    Args:
        values: iterable of text strings, passed to ``create_bow``.
        bigram_model: bigram model applied first.
        trigram_model: trigram model applied to the bigram output.

    Returns:
        The result of ``create_trigram_terms`` for the tokenized documents.
    """
    tokenized = create_bow(values)
    return create_trigram_terms(bigram_model, trigram_model, tokenized)

###################################
# 1-7. Dictionary & Corpus Creation
###################################
# Create Dictionary: {TermID: Term} (e.g. {0: 'apple'})
def create_dictionary(trigram_terms):
    """Build a gensim Dictionary from the given documents.

    Args:
        trigram_terms: documents handed to ``gensim.corpora.Dictionary``.

    Returns:
        The constructed ``gensim.corpora.Dictionary``.
    """
    return gensim.corpora.Dictionary(trigram_terms)

# Create Corpus: (TermID, Frequency) (e.g. [(0, 1), (1, 2), ...])
def create_corpus(trigram_terms, dictionary):
    """Convert each document with ``dictionary.doc2bow``.

    Args:
        trigram_terms: iterable of documents.
        dictionary: object exposing ``doc2bow(doc)``.

    Returns:
        A list with ``dictionary.doc2bow(doc)`` for every document, in
        input order.
    """
    bows = []
    for terms in trigram_terms:
        bows.append(dictionary.doc2bow(terms))
    return bows

# Create Dictionary & Corpus
def create_dict_corpus(trigram_terms):
    """Build the dictionary and the corpus for the given documents.

    Args:
        trigram_terms: documents passed to ``create_dictionary`` and
            ``create_corpus``.

    Returns:
        Tuple ``(dictionary, corpus, trigram_terms)``; the input documents
        are returned unchanged as the third element.
    """
    term_dictionary = create_dictionary(trigram_terms)
    bow_corpus = create_corpus(trigram_terms, term_dictionary)
    return term_dictionary, bow_corpus, trigram_terms

In [None]:
###################################
# 1-8. Lemmatizer & Stemmer Test
###################################
pd.set_option('display.max_columns', None) # None=auto detect

# Lemmatizer
sample_lemmatized1 = ['Lemmatizer'] + [lemmatizer.lemmatize(token) for token in sample_swr1]

# Snowball English Stemmer
snowball = nltk.stem.snowball.EnglishStemmer()

# Porter Stemmer
porter = nltk.stem.porter.PorterStemmer()

# Lancaster Stemmer
lancaster = nltk.stem.lancaster.LancasterStemmer()

In [None]:
###################################
# 1-9. lemmatizer & Stemmer Test1
###################################
sample_text1 = article_labeled.text.values[101].split()
print(' '.join(sample_text))

# Stop Words Removal
sample_swr1 = [token for token in sample_text1 if token not in stop_words]

sample_snowball1 = ['Snowball'] + [snowball.stem(token) for token in sample_swr1]
sample_porter1 = ['Porter'] + [porter.stem(token) for token in sample_swr1]
sample_lancaster1 = ['Lancaster'] + [lancaster.stem(token) for token in sample_swr1]

pd.DataFrame([sample_lemmatized1, sample_snowball1, sample_porter1, sample_lancaster1], columns=[''] + sample_swr1)