In [1]:
# LDA model 
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint

# Plotting tools
!pip install -U pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline





In [2]:
import joblib
from pathlib import Path

# The lemmatised corpora are addressed relative to the notebook's working
# directory (paths were made relative for the submission folder).
# The directory is assembled with pathlib rather than hard-coded backslashes:
# on Linux/macOS (including Colab) a backslash is an ordinary filename
# character, not a separator, so the old raw-string paths only resolved on Windows.
lemmatised_dir = Path("Resources") / "Resources"

# NOTE(review): "fornite" (sic) is kept exactly as in the original path -
# presumably it matches the file name on disk; confirm before renaming.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only load files produced by this project.
fortnite_lemmatised = joblib.load(lemmatised_dir / "fornite_lemmatised.joblib")
csgo_lemmatised = joblib.load(lemmatised_dir / "csgo_lemmatised.joblib")
league_lemmatised = joblib.load(lemmatised_dir / "league_lemmatised.joblib")
warcraft_lemmatised = joblib.load(lemmatised_dir / "warcraft_lemmatised.joblib")

In [4]:
# Fraction of each corpus that a 500k subset retains, in the order
# (fortnite, csgo, league, warcraft) - i.e. how much data is given up by
# working on a subset instead of the whole data set.
tuple(
    500000 / len(corpus)
    for corpus in (fortnite_lemmatised, csgo_lemmatised, league_lemmatised, warcraft_lemmatised)
)

(0.45360237397338443,
 0.3155362475419726,
 0.32091787647357467,
 0.1776736057685645)

In [None]:
# Document-term matrix
# CountVectorizer converts a collection of text documents into a matrix of token counts.
# In the resulting document-term matrix, rows correspond to documents in the
# collection, columns correspond to terms, and each cell holds how often that
# term occurs in that document.

SUBSET_SIZE = 500000  # number of leading documents of each corpus used for fitting


def make_count_vectorizer():
    """Return an unfitted CountVectorizer with the settings shared by all four games.

    Each corpus needs its own instance because fitting learns a
    corpus-specific vocabulary.
    """
    return CountVectorizer(analyzer='word',
                           min_df=10,                        # drop terms that appear in fewer than 10 documents
                           stop_words='english',             # remove English stop words
                           lowercase=True,                   # convert all words to lowercase
                           token_pattern='[a-zA-Z0-9]{3,}')  # tokens = runs of 3 or more alphanumeric characters


fortnite_vectorizer = make_count_vectorizer()
csgo_vectorizer = make_count_vectorizer()
league_vectorizer = make_count_vectorizer()
warcraft_vectorizer = make_count_vectorizer()

# Fit each vectorizer on the first SUBSET_SIZE documents of its corpus and
# build the matching document-term matrix.
# Per the ratios computed earlier in the notebook this is roughly 45% of the
# fortnite data, 32% of csgo and league, and 18% of warcraft.
fortnite_data_vectorized = fortnite_vectorizer.fit_transform(fortnite_lemmatised[:SUBSET_SIZE])
csgo_data_vectorized = csgo_vectorizer.fit_transform(csgo_lemmatised[:SUBSET_SIZE])
league_data_vectorized = league_vectorizer.fit_transform(league_lemmatised[:SUBSET_SIZE])
warcraft_data_vectorized = warcraft_vectorizer.fit_transform(warcraft_lemmatised[:SUBSET_SIZE])

# Persist the fitted vectorizers. The notebook was developed on Google Colab,
# so these absolute paths assume Google Drive is mounted at /content/drive.
joblib.dump(fortnite_vectorizer, '/content/drive/MyDrive/Year 3/Project/Resources/fortnite_vectorizer.joblib')
joblib.dump(csgo_vectorizer, '/content/drive/MyDrive/Year 3/Project/Resources/csgo_vectorizer.joblib')
joblib.dump(league_vectorizer, '/content/drive/MyDrive/Year 3/Project/Resources/league_vectorizer.joblib')
joblib.dump(warcraft_vectorizer, '/content/drive/MyDrive/Year 3/Project/Resources/warcraft_vectorizer.joblib')

['/content/drive/MyDrive/Year 3/Project/Resources/warcraft_vectorizer.joblib']

In [None]:
# Build LDA models with Sklearn


def make_lda_model():
    """Return an unfitted LatentDirichletAllocation with the shared hyper-parameters.

    Every game gets its own instance, fitted separately, but all four use the
    same settings and the same fixed seed.
    """
    return LatentDirichletAllocation(n_components=5,            # number of topics
                                     max_iter=10,               # maximum passes over the training data
                                     learning_method='online',  # update from mini-batches instead of the full corpus
                                     random_state=100,          # fixed seed
                                     batch_size=128)            # number of documents per mini-batch update


fortnite_lda_model = make_lda_model()
csgo_lda_model = make_lda_model()
league_lda_model = make_lda_model()
warcraft_lda_model = make_lda_model()

# Fit the fortnite model on its document-term matrix and keep the transformed
# output; the other three models are fitted in their own cells further down.
fortnite_lda_output = fortnite_lda_model.fit_transform(fortnite_data_vectorized)



In [None]:
# Persist the fitted fortnite LDA model, then its fit_transform output.
joblib.dump(value=fortnite_lda_model,
            filename='/content/drive/MyDrive/Year 3/Project/Resources/fortnite_lda_model.joblib')
joblib.dump(value=fortnite_lda_output,
            filename='/content/drive/MyDrive/Year 3/Project/Resources/fortnite_lda_output.joblib')

['/content/drive/MyDrive/Year 3/Project/Resources/fortnite_lda_output.joblib']

In [None]:
# Fit the csgo LDA model on the csgo document-term matrix and keep the
# transformed output (one row per document, one column per topic).
csgo_lda_output = csgo_lda_model.fit_transform(csgo_data_vectorized)


In [None]:
# Persist the fitted csgo LDA model, then its fit_transform output.
joblib.dump(value=csgo_lda_model,
            filename='/content/drive/MyDrive/Year 3/Project/Resources/csgo_lda_model.joblib')
joblib.dump(value=csgo_lda_output,
            filename='/content/drive/MyDrive/Year 3/Project/Resources/csgo_lda_output.joblib')

['/content/drive/MyDrive/Year 3/Project/Resources/csgo_lda_output.joblib']

In [None]:
# Fit the league LDA model on the league document-term matrix and keep the
# transformed output.
# NOTE(review): presumably one row per document and one column per topic, as
# in the csgo output displayed at the end of the notebook - confirm.
league_lda_output = league_lda_model.fit_transform(league_data_vectorized)



In [None]:
# Persist the fitted league LDA model, then its fit_transform output.
joblib.dump(value=league_lda_model,
            filename='/content/drive/MyDrive/Year 3/Project/Resources/league_lda_model.joblib')
joblib.dump(value=league_lda_output,
            filename='/content/drive/MyDrive/Year 3/Project/Resources/league_lda_output.joblib')

['/content/drive/MyDrive/Year 3/Project/Resources/league_lda_output.joblib']

In [None]:
# Fit the warcraft LDA model on the warcraft document-term matrix and keep the
# transformed output.
# NOTE(review): presumably one row per document and one column per topic, as
# in the csgo output displayed at the end of the notebook - confirm.
warcraft_lda_output = warcraft_lda_model.fit_transform(warcraft_data_vectorized)


In [None]:
# Persist the warcraft fit_transform output first, then the fitted model
# (the model dump is the cell's last expression, so its path is what is displayed).
joblib.dump(value=warcraft_lda_output,
            filename='/content/drive/MyDrive/Year 3/Project/Resources/warcraft_lda_output.joblib')
joblib.dump(value=warcraft_lda_model,
            filename='/content/drive/MyDrive/Year 3/Project/Resources/warcraft_lda_model.joblib')

['/content/drive/MyDrive/Year 3/Project/Resources/warcraft_lda_model.joblib']

In [None]:
# Display the csgo LDA output: one row per document, five topic weights per
# row (NumPy elides the middle rows in the rendered array).
# NOTE(review): rows that are exactly uniform (0.2 in every column) presumably
# belong to documents with no in-vocabulary tokens - confirm before
# interpreting them as genuinely mixed-topic documents.
csgo_lda_output

array([[0.1       , 0.1       , 0.1       , 0.1       , 0.6       ],
       [0.1       , 0.1       , 0.59999999, 0.1       , 0.1       ],
       [0.1       , 0.1       , 0.1       , 0.1       , 0.6       ],
       ...,
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.40000048, 0.06666667, 0.39999951, 0.06666667, 0.06666667],
       [0.30004117, 0.29991536, 0.05000031, 0.0500003 , 0.30004287]])