In [1]:
# Read the whole dataset into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('dataset3.txt') as dataset_file:
    corpus = dataset_file.read()

In [2]:
corpus

'milk bread bread bread : grocery\nbread milk milk bread : grocery\nmilk milk milk bread bread bread bread : grocery\ncat cat cat dog dog bark : pets\ndog dog cat bark mew mew : pets\ncat dog cat dog mew cat : pets\n'

In [3]:
docs = corpus.split('\n')

In [4]:
docs

['milk bread bread bread : grocery',
 'bread milk milk bread : grocery',
 'milk milk milk bread bread bread bread : grocery',
 'cat cat cat dog dog bark : pets',
 'dog dog cat bark mew mew : pets',
 'cat dog cat dog mew cat : pets',
 '']

In [10]:
# Split every line of the corpus into its text (x) and its label (y).
# The label is whatever follows the LAST ':' on the line, so a colon inside
# the text itself does not break the two-way unpacking.
x, y = [], []

for doc in docs:
    # Skip blank or whitespace-only lines (e.g. the empty string left by the
    # trailing newline of the file).
    if not doc.strip():
        continue
    text, label = doc.rsplit(':', 1)
    x.append(text.strip())
    y.append(label.strip())

print(x)
print(y)

['milk bread bread bread', 'bread milk milk bread', 'milk milk milk bread bread bread bread', 'cat cat cat dog dog bark', 'dog dog cat bark mew mew', 'cat dog cat dog mew cat']
['grocery', 'grocery', 'grocery', 'pets', 'pets', 'pets']


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
vec = CountVectorizer()

In [13]:
# Learn the vocabulary from all six documents and turn each one into a row of word counts.
# NOTE(review): this also fits on the sixth document, which is later used only for
# prediction; fitting on x[:5] and then transforming x would keep it unseen - confirm intent.
matrix_x = vec.fit_transform(x)

In [14]:
matrix_x

<6x6 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [15]:
matrix_x.toarray()

array([[0, 3, 0, 0, 0, 1],
       [0, 2, 0, 0, 0, 2],
       [0, 4, 0, 0, 0, 3],
       [1, 0, 3, 2, 0, 0],
       [1, 0, 1, 2, 2, 0],
       [0, 0, 3, 2, 1, 0]])

In [16]:
vec.vocabulary_

{'milk': 5, 'bread': 1, 'cat': 2, 'dog': 3, 'bark': 0, 'mew': 4}

## 3.2.1 Classification

### K-Nearest Neighbours Classification
### Predicting label for 6th document by training model on first five docs

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
knn = KNeighborsClassifier(n_neighbors = 2) # vary this n and see predict_proba

In [28]:
# Training step: the model learns the relation between the data and the labels.
# Only the first five documents are used; the sixth is kept back for prediction.

knn.fit(matrix_x[:5], y[:5])  # data (matrix rows) and the matching labels

KNeighborsClassifier(n_neighbors=2)

In [29]:
knn.predict(matrix_x[5])

array(['pets'], dtype='<U7')

In [30]:
knn.predict_proba(matrix_x[5])

array([[0., 1.]])

### Naive Bayes Classification

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
nbc = MultinomialNB()

In [33]:
nbc.fit(matrix_x[:5], y[:5])

MultinomialNB()

In [34]:
nbc.predict(matrix_x[5])

array(['pets'], dtype='<U7')

In [35]:
# Returns one row per document and one column per class, in nbc.classes_ order
# (presumably ['grocery', 'pets'], i.e. sorted labels - confirm with nbc.classes_).
# Each value is the model's estimated probability of that class for the document;
# predict() returns the class with the largest value.
nbc.predict_proba(matrix_x[5])

array([[6.34470873e-05, 9.99936553e-01]])

### Decision Tree Classification

In [36]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
# random_state fixes how ties between equally good splits are broken, so
# re-running the notebook builds the same tree every time.
dtc = DecisionTreeClassifier(random_state = 0)

In [38]:
dtc.fit(matrix_x[:5], y[:5])

DecisionTreeClassifier()

In [39]:
dtc.predict(matrix_x[5])

array(['pets'], dtype='<U7')

In [40]:
dtc.predict_proba(matrix_x[5])

array([[0., 1.]])

### Linear Classifier

In [41]:
from sklearn.linear_model import SGDClassifier

In [42]:
# SGD shuffles the training data between epochs; seeding it makes the fitted
# model (and therefore the prediction below) reproducible across runs.
linear = SGDClassifier(random_state = 0)

In [43]:
linear.fit(matrix_x[:5], y[:5])

SGDClassifier()

In [44]:
linear.predict(matrix_x[5])

array(['pets'], dtype='<U7')

### 3.2.2 Classifiers with different parameter settings

In [45]:
knn = KNeighborsClassifier(n_neighbors = 3, algorithm = 'brute', weights = 'distance')  # algorithm: how the neighbours of an unseen doc are searched for in the training data
# 'brute' compares the unseen doc against every training row, so it does not scale well to large datasets
# 'kd_tree' builds a tree over the training data so that only nearby instances need their distances computed
# weights = 'distance': the closer a neighbour is to the unseen doc, the more weight its vote gets

In [46]:
nb = MultinomialNB(alpha = 1.0, class_prior = [0.7, 0.3])
# alpha: additive smoothing factor applied to the word counts
# fit_prior: True (the default) learns the class priors from the training data; False uses a uniform prior
# class_prior: explicit prior per class; when given, the priors are not adjusted to the data
#   [0.7, 0.3] favours the first class in sorted label order with 0.7
#   ('grocery' for the grocery/pets labels used above)

In [47]:
dt = DecisionTreeClassifier(max_depth = 2, random_state = 0)
# max_depth: maximum depth the tree is allowed to grow to
# random_state: fixes the tie-breaking between equally good splits so the tree is reproducible

In [48]:
ln = SGDClassifier(max_iter = 1000, random_state = 0)
# max_iter: upper bound on the number of passes over the training data
# random_state: seeds the data shuffling so repeated fits give the same model

### 3.2.3 Classification with UCI Dataset

In [49]:
# Read the badges dataset in one go. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('badges.data') as badges_file:
    corpus = badges_file.read()

In [50]:
docs = corpus.split('\n')

In [51]:
# Every non-empty line carries its label in the first character and the text
# to classify from the third character onwards.
labelled_lines = [doc for doc in docs if doc != '']
x = [doc[2:] for doc in labelled_lines]
y = [doc[0] for doc in labelled_lines]

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
vec = TfidfVectorizer()

In [54]:
# Build the TF-IDF matrix with one row per badge line.
# NOTE(review): the vectorizer is fitted on all rows, including rows 284 onwards that
# are later used only for prediction, so the vocabulary and IDF weights are influenced
# by the held-out rows - consider vec.fit(x[:284]) followed by vec.transform(x).
matrix_x = vec.fit_transform(x)

In [55]:
matrix_x

<294x523 sparse matrix of type '<class 'numpy.float64'>'
	with 607 stored elements in Compressed Sparse Row format>

In [56]:
knn = KNeighborsClassifier()

In [57]:
knn.fit(matrix_x[:284], y[:284])

KNeighborsClassifier()

In [58]:
knn.predict(matrix_x[284:])

array(['+', '+', '+', '+', '+', '+', '+', '-', '+', '+'], dtype='<U1')

In [59]:
knn.predict_proba(matrix_x[284:])

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [133]:
#################################################

### Assignment

In [185]:
# Read the badges dataset in one go. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open('badges.data') as badges_file:
    corpus = badges_file.read()

In [186]:
docs = corpus.split('\n')

In [187]:
# Every non-empty line carries its label in the first character and the text
# to classify from the third character onwards.
labelled_lines = [doc for doc in docs if doc != '']
x = [doc[2:] for doc in labelled_lines]
y = [doc[0] for doc in labelled_lines]

In [188]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

In [189]:
vec = TfidfVectorizer()

In [190]:
# Build the TF-IDF matrix with one row per badge line.
# NOTE(review): the vectorizer is fitted on all rows, including rows 284 onwards that
# are later used only for prediction, so the vocabulary and IDF weights are influenced
# by the held-out rows - consider vec.fit(x[:284]) followed by vec.transform(x).
matrix_x = vec.fit_transform(x)

In [191]:
matrix_x

<294x523 sparse matrix of type '<class 'numpy.float64'>'
	with 607 stored elements in Compressed Sparse Row format>

In [192]:
# Depth-2 tree: train on the first 284 rows, predict the remaining 10.
# random_state fixes the tie-breaking between equally good splits, so any
# difference between depth settings comes from max_depth and not from chance.
dtc = DecisionTreeClassifier(max_depth = 2, random_state = 0)

dtc.fit(matrix_x[:284], y[:284])

dtc.predict(matrix_x[284:])

array(['+', '+', '+', '+', '+', '+', '+', '-', '+', '+'], dtype='<U1')

In [193]:
# Depth-3 tree: train on the first 284 rows, predict the remaining 10.
# random_state fixes the tie-breaking between equally good splits, so any
# difference between depth settings comes from max_depth and not from chance.
dtc = DecisionTreeClassifier(max_depth = 3, random_state = 0)

dtc.fit(matrix_x[:284], y[:284])

dtc.predict(matrix_x[284:])

array(['+', '+', '+', '+', '+', '+', '+', '-', '+', '+'], dtype='<U1')

In [194]:
# Depth-4 tree: train on the first 284 rows, predict the remaining 10.
# random_state fixes the tie-breaking between equally good splits, so any
# difference between depth settings comes from max_depth and not from chance.
dtc = DecisionTreeClassifier(max_depth = 4, random_state = 0)

dtc.fit(matrix_x[:284], y[:284])

dtc.predict(matrix_x[284:])

array(['+', '+', '+', '+', '+', '+', '+', '-', '+', '+'], dtype='<U1')

In [195]:
# Depth-5 tree: train on the first 284 rows, predict the remaining 10.
# random_state fixes the tie-breaking between equally good splits, so any
# difference between depth settings comes from max_depth and not from chance.
dtc = DecisionTreeClassifier(max_depth = 5, random_state = 0)

dtc.fit(matrix_x[:284], y[:284])

dtc.predict(matrix_x[284:])

array(['+', '+', '+', '+', '+', '+', '+', '-', '+', '+'], dtype='<U1')

In [196]:
# Depth-6 tree: train on the first 284 rows, predict the remaining 10.
# random_state fixes the tie-breaking between equally good splits, so any
# difference between depth settings comes from max_depth and not from chance.
dtc = DecisionTreeClassifier(max_depth = 6, random_state = 0)

dtc.fit(matrix_x[:284], y[:284])

dtc.predict(matrix_x[284:])

array(['+', '+', '+', '+', '+', '+', '+', '-', '+', '+'], dtype='<U1')

In [197]:
# Depth-10 tree: train on the first 284 rows, predict the remaining 10.
# random_state fixes the tie-breaking between equally good splits, so any
# difference between depth settings comes from max_depth and not from chance.
dtc = DecisionTreeClassifier(max_depth = 10, random_state = 0)

dtc.fit(matrix_x[:284], y[:284])

dtc.predict(matrix_x[284:])

array(['+', '+', '+', '+', '+', '+', '+', '-', '+', '+'], dtype='<U1')

In [198]:
# Depth-20 tree: train on the first 284 rows, predict the remaining 10.
# random_state fixes the tie-breaking between equally good splits, so any
# difference between depth settings comes from max_depth and not from chance.
dtc = DecisionTreeClassifier(max_depth = 20, random_state = 0)

dtc.fit(matrix_x[:284], y[:284])

dtc.predict(matrix_x[284:])

array(['+', '+', '+', '+', '+', '+', '+', '-', '+', '+'], dtype='<U1')