## Imports

In [112]:
from __future__ import print_function

from time import perf_counter, time

import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

pyLDAvis.enable_notebook()

# Part 1: Topic Modeling Visualization

### Fetch the 20 Newsgroups dataset from sklearn
Removing headers, footers and quotes.
Selecting only the `sci.med`, `sci.space` and `talk.politics.guns` categories.

In [2]:
# Load only the three selected newsgroups, with headers, footers and quoted
# text removed from every post.
selected_categories = ('sci.med', 'sci.space', 'talk.politics.guns')
removed_parts = ('headers', 'footers', 'quotes')
newsgroups = fetch_20newsgroups(categories=selected_categories,
                                remove=removed_parts)
docs_raw = newsgroups.data
print(len(docs_raw))

1733


### Creating Normal Bag of Words using CountVectorizer

In [3]:
# Settings for the raw term-count vectorizer, gathered in one mapping.
# The token pattern keeps only runs of three or more ASCII letters.
count_vectorizer_options = {
    'strip_accents': 'unicode',
    'stop_words': 'english',
    'lowercase': True,
    'token_pattern': r'\b[a-zA-Z]{3,}\b',
    'max_df': 0.5,
    'min_df': 10,
}
tf_vectorizer = CountVectorizer(**count_vectorizer_options)

# Learn the vocabulary and build the document-term count matrix in one pass.
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)

(1733, 2589)


### Creating TF-IDF Bag of Words using TfidfVectorizer

In [4]:
# Reuse the CountVectorizer settings so both document-term matrices are built
# with the same tokenisation and vocabulary filters.
# `dtype` is overridden because get_params() carries over CountVectorizer's
# integer dtype; TfidfVectorizer only supports floating-point dtypes and would
# emit a UserWarning before converting to float64 anyway.
tfidf_params = {**tf_vectorizer.get_params(), 'dtype': np.float64}
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)



(1733, 2589)


### Creating LDA Model for Normal and TFIDF Bag of words

In [11]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=3, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=3, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

### Visualizing LDA fitted using Normal Bag of Words

In [12]:
# pyLDAvis view of the term-count LDA model, using the default `mds` setting
pyLDAvis.sklearn.prepare(lda_model=lda_tf, dtm=dtm_tf, vectorizer=tf_vectorizer)

### Visualizing LDA fitted using TFIDF Bag of Words

In [13]:
# pyLDAvis view of the TF-IDF LDA model, using the default `mds` setting
pyLDAvis.sklearn.prepare(lda_model=lda_tfidf, dtm=dtm_tfidf, vectorizer=tfidf_vectorizer)

### Visualizing LDA fitted using Normal Bag of Words and Metric Multidimensional Scaling (MMDS)

In [17]:
# Same term-count model, laid out with mds='mmds'
pyLDAvis.sklearn.prepare(lda_model=lda_tf, dtm=dtm_tf, vectorizer=tf_vectorizer, mds='mmds')

### Visualizing LDA fitted using Normal Bag of Words and a t-SNE projection

In [15]:
# Same term-count model, laid out with mds='tsne'
pyLDAvis.sklearn.prepare(lda_model=lda_tf, dtm=dtm_tf, vectorizer=tf_vectorizer, mds='tsne')

### Visualizing LDA fitted using TF-IDF Bag of Words and a t-SNE projection

In [18]:
# TF-IDF model, laid out with mds='tsne'
pyLDAvis.sklearn.prepare(lda_model=lda_tfidf, dtm=dtm_tfidf, vectorizer=tfidf_vectorizer, mds='tsne')

# Part 2: Association Rule Mining

### Creating Dataframe of Ratings_small dataset and filtering with rating > 3.0

In [103]:
# Read every rating, then keep only the rows rated strictly above 3.0.
ratings_all = pd.read_csv('ratings_small.csv')
above_three = ratings_all['rating'] > 3.0
df_ratings_small = ratings_all[above_three]
df_ratings_small

Unnamed: 0,userId,movieId,rating,timestamp
4,1,1172,4.0,1260759205
8,1,1339,3.5,1260759125
12,1,1953,4.0,1260759191
13,1,2105,4.0,1260759139
20,2,10,4.0,835355493
...,...,...,...,...
99996,671,5991,4.5,1064245387
99997,671,5995,4.0,1066793014
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363


### Dropping columns rating and timestamp

In [105]:
# Keep only the userId/movieId pairs needed to build per-user transactions.
# This cell rebinds its own input, so a second run would find the columns
# already removed and raise KeyError; errors='ignore' makes it safe to re-run.
df_ratings_small = df_ratings_small.drop(columns=['rating', 'timestamp'],
                                         errors='ignore')
df_ratings_small

Unnamed: 0,userId,movieId
4,1,1172
8,1,1339
12,1,1953
13,1,2105
20,2,10
...,...,...
99996,671,5991
99997,671,5995
100000,671,6269
100001,671,6365


### Grouping movieId with respect to userId and Creating movie_list from dataframe

In [108]:
# Collect each user's movieIds into one list, giving one transaction per user.
movies_by_user = df_ratings_small.groupby('userId')['movieId']
df_movie_list = movies_by_user.apply(list)

# Plain list of transactions (list of lists) for the encoder
movie_list = list(df_movie_list)

### Converting categorical movie_list to One Hot Encoded dataframe

In [109]:
# Learn the set of distinct items, then encode every transaction against it.
te = TransactionEncoder()
te.fit(movie_list)
te_ary = te.transform(movie_list)

# One row per transaction, one column per item in te.columns_
df = pd.DataFrame(data=te_ary, columns=te.columns_)

### Observing time taken for different support values from 0.05 to 0.15. Frequent itemset generation takes more time for smaller support values. As min_support increases, the number of frequent itemsets decreases and the time taken to generate them is greatly reduced.

In [137]:
# Measure how long frequent-itemset mining takes as min_support increases.
# In the recorded run the smallest threshold (0.05) took about 150 seconds.
supports = [0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.15]
for support in supports:
    # perf_counter is the clock intended for measuring short intervals; only
    # the apriori call is timed, so len()/print overhead is excluded.
    start = perf_counter()
    frequent_itemsets = apriori(df, min_support=support, use_colnames=True)
    elapsed = perf_counter() - start
    print(f'Number of items in frequent_itemsets with min_support {support} : {len(frequent_itemsets)}')
    print(f'Time taken to generate frequent itemsets with min_support : {support} = {elapsed}')
# After the loop, frequent_itemsets holds the result for the last support
# value in the list (0.15).

 Number of items in frequent_itemsets with min_support 0.05 : 106254
Time taken to generate frequent itemsets with min_support : 0.05 = 153.1415662765503
 Number of items in frequent_itemsets with min_support 0.06 : 29116
Time taken to generate frequent itemsets with min_support : 0.06 = 36.43545937538147
 Number of items in frequent_itemsets with min_support 0.07 : 12063
Time taken to generate frequent itemsets with min_support : 0.07 = 11.553513526916504
 Number of items in frequent_itemsets with min_support 0.08 : 5140
Time taken to generate frequent itemsets with min_support : 0.08 = 4.686921119689941
 Number of items in frequent_itemsets with min_support 0.09 : 2596
Time taken to generate frequent itemsets with min_support : 0.09 = 2.019066572189331
 Number of items in frequent_itemsets with min_support 0.1 : 1432
Time taken to generate frequent itemsets with min_support : 0.1 = 1.0423383712768555
 Number of items in frequent_itemsets with min_support 0.15 : 195
Time taken to gene

### Generating association rules from the frequent itemsets found with min_support 0.15, using confidence thresholds from 0.5 to 0.9

In [139]:
# Mine the itemsets for this section explicitly instead of relying on whatever
# value `frequent_itemsets` was left with by an earlier loop; at this support
# level the recorded run took well under a second.
frequent_itemsets = apriori(df, min_support=0.15, use_colnames=True)

# Show the rule table for each minimum-confidence threshold in turn.
confidence_levels = [0.5, 0.6, 0.7, 0.8, 0.9]
for level in confidence_levels:
    display(association_rules(frequent_itemsets, metric="confidence", min_threshold=level))

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1),(296),0.271237,0.400894,0.150522,0.554945,1.384268,0.041784,1.346139
1,(1),(318),0.271237,0.426230,0.154993,0.571429,1.340659,0.039383,1.338798
2,(1),(356),0.271237,0.406855,0.156483,0.576923,1.418005,0.046129,1.401978
3,(47),(296),0.242921,0.400894,0.196721,0.809816,2.020024,0.099336,3.150137
4,(47),(318),0.242921,0.426230,0.159463,0.656442,1.540113,0.055923,1.670082
...,...,...,...,...,...,...,...,...,...
229,"(1196, 1198)","(1210, 260)",0.211624,0.223547,0.152012,0.718310,3.213239,0.104704,2.756408
230,"(260, 1198)","(1210, 1196)",0.220566,0.216095,0.152012,0.689189,3.189282,0.104349,2.522128
231,(1210),"(1196, 260, 1198)",0.265276,0.187779,0.152012,0.573034,3.051632,0.102199,1.902306
232,(1196),"(1210, 260, 1198)",0.299553,0.160954,0.152012,0.507463,3.152847,0.103798,1.703518


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(47),(296),0.242921,0.400894,0.196721,0.809816,2.020024,0.099336,3.150137
1,(47),(318),0.242921,0.426230,0.159463,0.656442,1.540113,0.055923,1.670082
2,(47),(356),0.242921,0.406855,0.157973,0.650307,1.598373,0.059139,1.696185
3,(47),(593),0.242921,0.385991,0.166915,0.687117,1.780136,0.073150,1.962421
4,(50),(296),0.274218,0.400894,0.204173,0.744565,1.857261,0.094241,2.345436
...,...,...,...,...,...,...,...,...,...
146,"(1210, 260)","(1196, 1198)",0.223547,0.211624,0.152012,0.680000,3.213239,0.104704,2.463674
147,"(1210, 1198)","(1196, 260)",0.175857,0.251863,0.152012,0.864407,3.432053,0.107720,5.517511
148,"(1196, 260)","(1210, 1198)",0.251863,0.175857,0.152012,0.603550,3.432053,0.107720,2.078809
149,"(1196, 1198)","(1210, 260)",0.211624,0.223547,0.152012,0.718310,3.213239,0.104704,2.756408


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(47),(296),0.242921,0.400894,0.196721,0.809816,2.020024,0.099336,3.150137
1,(50),(296),0.274218,0.400894,0.204173,0.744565,1.857261,0.094241,2.345436
2,(50),(318),0.274218,0.426230,0.192250,0.701087,1.644858,0.075371,1.919523
3,(110),(356),0.251863,0.406855,0.183308,0.727811,1.788868,0.080837,2.179162
4,(1196),(260),0.299553,0.368107,0.251863,0.840796,2.284106,0.141595,3.969076
...,...,...,...,...,...,...,...,...,...
69,"(1210, 260, 1198)",(1196),0.160954,0.299553,0.152012,0.944444,3.152847,0.103798,12.608048
70,"(1196, 260, 1198)",(1210),0.187779,0.265276,0.152012,0.809524,3.051632,0.102199,3.857303
71,"(1210, 1196)","(260, 1198)",0.216095,0.220566,0.152012,0.703448,3.189282,0.104349,2.628323
72,"(1210, 1198)","(1196, 260)",0.175857,0.251863,0.152012,0.864407,3.432053,0.107720,5.517511


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(47),(296),0.242921,0.400894,0.196721,0.809816,2.020024,0.099336,3.150137
1,(1196),(260),0.299553,0.368107,0.251863,0.840796,2.284106,0.141595,3.969076
2,(1210),(260),0.265276,0.368107,0.223547,0.842697,2.289269,0.125897,4.017032
3,(1221),(858),0.177347,0.277198,0.163934,0.92437,3.334689,0.114774,9.557046
4,(1210),(1196),0.265276,0.299553,0.216095,0.814607,2.719409,0.136631,3.778169
5,(1291),(1198),0.172876,0.295082,0.157973,0.913793,3.096743,0.10696,8.177049
6,(5952),(4993),0.229508,0.251863,0.210134,0.915584,3.635249,0.15233,8.862547
7,(4993),(5952),0.251863,0.229508,0.210134,0.83432,3.635249,0.15233,4.650468
8,(7153),(4993),0.223547,0.251863,0.201192,0.9,3.573373,0.144889,7.481371
9,(5952),(7153),0.229508,0.223547,0.198212,0.863636,3.863333,0.146906,5.693989


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1221),(858),0.177347,0.277198,0.163934,0.92437,3.334689,0.114774,9.557046
1,(1291),(1198),0.172876,0.295082,0.157973,0.913793,3.096743,0.10696,8.177049
2,(5952),(4993),0.229508,0.251863,0.210134,0.915584,3.635249,0.15233,8.862547
3,"(1210, 1196)",(260),0.216095,0.368107,0.196721,0.910345,2.473042,0.117175,7.048034
4,"(2571, 1196)",(260),0.183308,0.368107,0.165425,0.902439,2.451565,0.097948,6.4769
5,"(1210, 1198)",(260),0.175857,0.368107,0.160954,0.915254,2.486379,0.09622,7.456334
6,"(5952, 260)",(4993),0.153502,0.251863,0.150522,0.980583,3.893319,0.11186,38.529061
7,"(1210, 1198)",(1196),0.175857,0.299553,0.160954,0.915254,3.055401,0.108275,8.265276
8,"(5952, 2571)",(4993),0.175857,0.251863,0.165425,0.940678,3.734881,0.121133,12.611454
9,"(7153, 2571)",(4993),0.165425,0.251863,0.154993,0.936937,3.720028,0.113328,11.863317


### As the confidence threshold increases, the number of rules decreases