# icma46 notebook

# Q1 - kNN classifier exercise

In [2]:
# standard imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# import KNeighbours classifier
from sklearn.neighbors import KNeighborsClassifier

In [5]:
# helper functions
def normalise(lowerlimit, upperlimit, x):
    ''' scales a value to the range defined by the lower 
        and upper limits provided [0-1]
    '''
    return ((x-lowerlimit)/(upperlimit-lowerlimit))

In [7]:
def normalise_dataframe(df):
    ''' scales values in a dataframe
        assumes columns are numerical that can be scaled
        returns a new dataFrame of the scaled values
    '''
    # set up resultant DataFrame
    res = pd.DataFrame()
    # Iterate through each column
    for c in df.columns:
        # apply the normalise function to each value in the column
        # assign it to a new column in the resultant dataframe 
        res[c] = df[c].apply(lambda x: normalise(max(df[c]), min(df[c]), x))
    # return the resultant DataFrame
    return res

In [9]:
# import the iris measurement and classification data
irisdata_df = pd.read_csv('iCMA46/iris_data.csv')
irisdata_df.head()

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Species
0,5.1,3.5,1.4,0.2,I. setosa
1,4.9,3.0,1.4,0.2,I. setosa
2,4.7,3.2,1.3,0.2,I. setosa
3,4.6,3.1,1.5,0.2,I. setosa
4,5.0,3.6,1.4,0.2,I. setosa


In [12]:
# normalise the columns required
n_iris_training_df = normalise_dataframe(irisdata_df[['Sepal length',
                                                     'Sepal width',
                                                     'Petal length',
                                                     'Petal width']])
# check it looks ok
n_iris_training_df.head()


Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width
0,0.777778,0.375,0.932203,0.958333
1,0.833333,0.583333,0.932203,0.958333
2,0.888889,0.5,0.949153,0.958333
3,0.916667,0.541667,0.915254,0.958333
4,0.805556,0.333333,0.932203,0.958333


In [16]:
# create the target values series
iris_target_values = irisdata_df['Species']

In [28]:
# create a DataFrame of the test data
test_iris_data_df = pd.DataFrame(
    {'Sample': ['Sample 1','Sample 2','Sample 3','Sample 4'],
                    'Sepal length':  [4.32, 4.74, 6.20, 7.19],
                    'Sepal width': [3.35, 2.96, 3.06, 3.34],
                     'Petal length': [2.21, 6.46, 6.37, 1.81],
                    'Petal width': [0.38, 0.24, 1.46, 1.66]
                    })
test_iris_data_df.head()

Unnamed: 0,Petal length,Petal width,Sample,Sepal length,Sepal width
0,2.21,0.38,Sample 1,4.32,3.35
1,6.46,0.24,Sample 2,4.74,2.96
2,6.37,1.46,Sample 3,6.2,3.06
3,1.81,1.66,Sample 4,7.19,3.34


In [29]:
# normalise the test_iris_data_df
n_test_iris_df = normalise_dataframe(test_iris_data_df[['Sepal length',
                                                      'Sepal width',
                                                      'Petal length',
                                                      'Petal width']])
n_test_iris_df.head()

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width
0,1.0,-0.0,0.913978,0.901408
1,0.853659,1.0,-0.0,1.0
2,0.344948,0.74359,0.019355,0.140845
3,-0.0,0.025641,1.0,-0.0


In [30]:
# create a classifier with the required k value (n_neighbors)
iris_classifier = KNeighborsClassifier(n_neighbors=3, metric='euclidean', weights='uniform')

In [31]:
# train the classifier with the training data and target values
iris_classifier.fit(n_iris_training_df, iris_target_values)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [32]:
# get the results of classification
results = iris_classifier.predict(n_test_iris_df)

In [37]:
# add to dataframe to make viewing easier
test_iris_data_df['Classification'] = results
test_iris_data_df[['Sample','Sepal length', 'Sepal width', 'Petal length', 'Petal width', 'Classification']]

Unnamed: 0,Sample,Sepal length,Sepal width,Petal length,Petal width,Classification
0,Sample 1,4.32,3.35,2.21,0.38,I. setosa
1,Sample 2,4.74,2.96,6.46,0.24,I. versicolor
2,Sample 3,6.2,3.06,6.37,1.46,I. virginica
3,Sample 4,7.19,3.34,1.81,1.66,I. virginica


# Q3 - K-means clustering exercise

In [None]:
# standard imports and sklearn.cluster
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn import cluster

In [None]:
!head -5 'iCMA46/icma_cluster_question.csv'

In [None]:
# create a df from the csv file
icma_cluster_df = pd.read_csv('iCMA46/icma_cluster_question.csv')
icma_cluster_df.head()

In [None]:
initial_centroids_df = pd.DataFrame({'A': [3, 60, 73], 
                                     'B': [69, 45, 20], 
                                     'C':[89, 34, 22],
                                    'D':[7, -21, -5]})

initial_centroids_df = pd.DataFrame({'X': [3, 69, 89, 7], 
                                     'Y': [60, 45, 34, -21],
                                     'Z':[73, 20, 22, -5]})

initial_centroids_df

In [None]:
kmeans4 = cluster.KMeans(n_clusters=4, init=initial_centroids_df)

In [None]:
assignedClusters_clust = kmeans4.fit(icma_cluster_df)

assignedClusters_clust.labels_

In [None]:
for i in range(4):
    print(i, ': ')
    print(list(assignedClusters_clust.labels_).count(i))
    print('\n')

# Question 5 - vector spce model.

In [None]:
def tokenise_document(docIn_str):
    '''Return a list of the tokens in the input string docIN_str'''
    return docIn_str.split()

In [None]:
def build_term_index(tokenisedDocuments_coll):
    ''' Return a set of all the terms appearing in 
        the documents in tokenisedDocuments_coll
    '''
    allTerms_set = set() # store the tokens as a set to remove repetitions
    
    for tokens_coll in tokenisedDocuments_coll:
        allTerms_set = allTerms_set.union(set(tokens_coll))
    
    return list(allTerms_set) # Return the members as a list


In [None]:
# importing the counter function to count all occurances of a term
from collections import Counter

In [None]:
doc_coll = ['the Coleoptera includes more species than any other order and constitute almost a quarter of all known types of animal life forms',
           'the Curculionidae comprise the family of the true weevils and is the third largest animal family']

In [None]:
tokenised_docs_ls = [tokenise_document(doc_txt) for doc_txt in doc_coll]
tokenised_docs_ls

In [None]:
term_index_ls = build_term_index(tokenised_docs_ls)
term_index_ls

In [None]:
def build_tf_vector(tokenisedDocument_ls, termIndex_ls):
    '''Return a pandas Series representing the term 
       frequency vector of the tokenised document 
       tokenisedDocument_ls, and indexed with termIndex_ls
    '''
    
    return pd.Series(Counter(tokenisedDocument_ls),
                     index=termIndex_ls).fillna(0)

In [None]:
termFreq_1_ss = build_tf_vector(tokenised_docs_ls[0], termIndex_ls=term_index_ls)

In [None]:
termFreq_2_ss = build_tf_vector(tokenised_docs_ls[1], termIndex_ls=term_index_ls)

In [None]:
# import scipy.spatial.distance.cosine function
from scipy.spatial.distance import cosine

In [None]:
cosine(termFreq_1_ss, termFreq_2_ss)

# Question 6

In [None]:
doc_coll2 = ['Victor Marie Hugo was a French poet and dramatist of the Romantic movement',
            'Moliere was a French playwright and actor who is considered to be one of the greatest masters of comedy in Western literature',
            'Alexandre Dumas was a French writer whose works have been translated into nearly 100 languages',
            'Jose Zorrilla y Moral was a Spanish Romantic poet and dramatist',
            'William Blake was an English poet who is now considered a seminal figure in the history of the poetry and visual arts of the Romantic Age']
doc_coll2

In [None]:
tokenised_docs_ls = [tokenise_document(doc_txt) for doc_txt in doc_coll2]
# build term index
term_index_ls = build_term_index(tokenised_docs_ls)



In [None]:
from scipy.spatial.distance import cosine

In [None]:
term_index_ls = build_term_index(tokenised_docs_ls)
# results list for easy comparison
res_list = []
for i in range(len(doc_coll2)):
    termFreq = build_tf_vector(tokenised_docs_ls[i], term_index_ls)
    if i == 0:
        sen1_tf = termFreq
    else:
        res_list.append([i, cosine(sen1_tf, termFreq)])

res_list