In [4]:
# task1a.py
import pandas as pd
import numpy as np
import csv
import textdistance

# read corresponding files
amazon_small = pd.read_csv('amazon_small.csv')
google_small = pd.read_csv('google_small.csv')
amazon_google_truth_small = pd.read_csv('amazon_google_truth_small.csv')

# create a list to install all the product pairs
total_pair = []

# select columns separately from two datasets
for i in range(amazon_small.shape[0]):
    max_num = 0
    
    amazon_title = amazon_small.loc[i]["title"]
    amazon_des = amazon_small["description"]
    amazon_manu = amazon_small["manufacturer"]
    amazon_price = amazon_small["price"]
    
    for j in range(google_small.shape[0]):
        google_title = google_small.loc[j]["name"]
        google_des = google_small["description"]
        google_manu = google_small["manufacturer"]
        google_price = google_small["price"]
        
        t1 = amazon_title.split()
        t2 = google_title.split()
        
        # calculate the difference      
        sim1 = textdistance.jaccard(t1, t2)
        sim2 = textdistance.jaccard(str(amazon_des[i]), str(google_price[j]))
        sim3 = textdistance.jaccard(str(amazon_manu[i]), str(google_manu[j]))
        sim4 = textdistance.jaccard(str(amazon_price[i]), str(google_price[j]))
        
        # weight the difference    
        total_sim = sim1*0.3 + sim2*0.2 + sim3*0.1 + sim4*0.4
    
        # find the biggest similarity under each amazon id
        if total_sim > max_num:
            max_num = total_sim
            amazonid = amazon_small.loc[i]["idAmazon"]
            googleid = google_small.loc[i]["idGoogleBase"]

    # create a threshold 
    if max_num > 0.40:
        total_pair.append([amazonid, googleid])

# create a csv file
with open ("task1a.csv", 'a', newline = '') as f:
    writer = csv.writer(f)
    writer.writerow(['idAmazon', 'idGoogleBase'])
    for i in range(len(total_pair)):
        writer.writerow(total_pair[i])

In [5]:
# task1b.py
# read all corresponding csv files
amazon = pd.read_csv('amazon.csv')
google = pd.read_csv('google.csv')
amazon_google_truth = pd.read_csv('amazon_google_truth.csv')
# create a function to calculate gdp in aud
def gdp_to_aud(price):
    price = float(price.split()[0])*1.88
    return price

# convert gdp to aud
for i in range(google.shape[0]):
    try:
        float(google.loc[i]["price"])
    except:
        google.loc[i]["price"] = gdp_to_aud(google.loc[i]["price"])
    try:
        google["price"][i] = float(google["price"][i])
    except:
        google["price"][i] = np.nan

# create bin size and range
numlist = [num for num in range(0, 500, 20)]
for num in range(500, 1000, 100):
    numlist.append(num)
for num in range(1000, 10000, 1000):
    numlist.append(num)
for num in range(10000, 60000, 10000):
    numlist.append(num)
label = [num for num in range(len(numlist)-1)]

google["price"] = pd.cut(google["price"], bins = numlist, labels = label)
amazon["price"] = pd.cut(amazon["price"], bins = numlist, labels = label)

block_key = set()
for i in amazon["price"]:
    block_key.add(str(i))
for i in google["price"]:
    block_key.add(str(i))  

# put everything into the blocks
block_d = {}
for keys in block_key:
    d = {}
    d["Amazon"] = []
    d["Google"] = []
    block_d[keys] = d
block_d[np.nan] = d

for i in range(amazon.shape[0]):
    try:
        block_d[amazon.loc[i]["price"]]["Amazon"].append(amazon.loc[i]["idAmazon"])
    except:
        block_d[np.nan]["Amazon"].append(amazon.loc[i]["idAmazon"])
for i in range(google.shape[0]):
    try:
        block_d[google.loc[i]["price"]]["Google"].append(google.loc[i]["id"])
    except:
        block_d[np.nan]["Google"].append(google.loc[i]["id"])   

# save the dataframe to csv file
amazon_df = pd.DataFrame({"blocking_keys": amazon["price"], "product_id": amazon["idAmazon"]})
google_df = pd.DataFrame({"blocking_keys": google["price"], "product_id": google["id"]})
amazon_df.to_csv("amazon_blocks.csv")
google_df.to_csv("google_blocks.csv")

In [11]:
# task2a.py

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score

world = pd.read_csv("world.csv")
life = pd.read_csv("life.csv")
life_world = life.merge(world.iloc[:,2:], how = 'left')
life_world = life_world.replace("..", np.nan)
mean_list = []
std_list = []

data = life_world.iloc[:, 4:]
y = life_world["Life expectancy at birth (years)"]

median_list = [i for i in data.median(skipna=True)]

for column in data.columns:
    median = data[column].median()
    data[column].fillna(median, inplace = True)
#     X_test[column].fillna(median, inplace = True)
X_train, X_test, y_train, y_test = train_test_split(data, y, train_size = 2/3, test_size=1/3, random_state = 100)

scaler = StandardScaler()
standardised = data.copy()
standardised.iloc[:, :] = scaler.fit_transform(standardised.iloc[:, :])

mean = pd.DataFrame(data = standardised.mean())

for i in range(len(standardised.mean())):
    row = mean.iloc[i].values
    mean_list.append(float(row))
    
std = pd.DataFrame(data = standardised.std())

for i in range(len(std)):
    row = std.iloc[i].values
    std_list.append(float(row))
    
X_train = StandardScaler(with_mean = True, with_std = True).fit_transform(X_train)
X_test = StandardScaler(with_mean = True, with_std = True).fit_transform(X_test)

# Decision Tree
clf = DecisionTreeClassifier(max_depth=4)
clf.fit(X_train, y_train)
val1 = round(clf.score(X_test, y_test)*100,3)
print(f'Accuracy of decision tree: {val1}%')

# knn = 5
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)
val2 = round(knn5.score(X_test, y_test)*100,3)
print(f'Accuracy of k-nn (k=5):{val2}%')

# knn = 10
knn10 = KNeighborsClassifier(n_neighbors=10)
knn10.fit(X_train, y_train)
val3 = round(knn10.score(X_test, y_test)*100, 3)
print(f'Accuracy of k-nn (k=10):{val3}%')

# save the dataframe to csv file
task2a = pd.DataFrame({"feature": data.columns, "median": median_list, "mean": mean_list, "variance": std_list})
task2a.to_csv("task2a.csv")

Accuracy of decision tree: 72.131%
Accuracy of k-nn (k=5):81.967%
Accuracy of k-nn (k=10):83.607%


In [18]:
# task2b.py
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_mutual_info_score
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# find feature * feature
poly = PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False)
new_features = poly.fit_transform(data)

new_features = pd.DataFrame(new_features)

# the name of the columns
col_name = []
for i in range(data.shape[1]):
    for j in range(i, data.shape[1]):
        if i != j:
            col_name.append(f"f{i}*f{j}")
            
full_col_name = []
for i in data.columns:
    full_col_name.append(i)
for i in col_name:
    full_col_name.append(i)
    
new_features.columns = full_col_name

# add one column to the dataframe
kmeans = KMeans(n_clusters=3).fit(new_features)
new_features["f_clusterabel"] = kmeans.labels_

shape = data.shape[1]
# find the accuracy by MI

# calculate MI for each feature
MI_dict = {}
MI_name = []
feature_list = []
for i in range(shape):
    for j in range(i, shape):
        if i != j:
            MI_dict[f"f{i}*f{j}"] = mutual_info_score(data.iloc[:,i].astype('float'), data.iloc[:,j].astype('float'))

MI_dict = sorted(MI_dict.items(),key = lambda x:x[1],reverse = True)
MI_dict_most = MI_dict[:4]

for i in MI_dict_most:
    MI_name.append(i[0])
for i in MI_name:
    feature_list.append(new_features[i])
feature_list = pd.DataFrame(feature_list).transpose()

# split training and testing data
X_train, X_test, y_train, y_test = train_test_split(feature_list, y, train_size = 2/3, test_size=1/3, random_state = 100)

# train knn
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)

# test accuracy
y_test_pred = knn5.predict(X_test)
test = round(accuracy_score(y_test, y_test_pred)*100, 3)

# train accuracy
y_train_pred = knn5.predict(X_train)
train = round(accuracy_score(y_train, y_train_pred)*100, 3)

val1 = round(knn5.score(X_test, y_test)*100,3)

print(f'Train Accuracy(MI): {train}%')
print(f'Test Accuracy(MI): {test}%')
print(f'Accuracy of k-nn (k=5) by using MI:{val1}%')
print("===========================================")

# find accruacy by AMI
AMI_dict = {}
AMI_name = []
feature_list = []
for i in range(shape):
    for j in range(i, shape):
        if i != j:
            AMI_dict[f"f{i}*f{j}"] = adjusted_mutual_info_score(data.iloc[:,i].astype('float'), data.iloc[:,j].astype('float'))

AMI_dict = sorted(AMI_dict.items(),key = lambda x:x[1],reverse = True)
AMI_dict_most = AMI_dict[:4]

for i in AMI_dict_most:
    AMI_name.append(i[0])
for i in AMI_name:
    feature_list.append(new_features[i])
feature_list = pd.DataFrame(feature_list).transpose()

X_train, X_test, y_train, y_test = train_test_split(feature_list, y, train_size = 2/3, test_size=1/3, random_state = 100)
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)
y_test_pred = knn5.predict(X_test)
test = round(accuracy_score(y_test, y_test_pred)*100, 3)
y_train_pred = knn5.predict(X_train)
train = round(accuracy_score(y_train, y_train_pred)*100, 3)

val1 = round(knn5.score(X_test, y_test)*100,3)

print(f'Train Accuracy: {train}%')
print(f'Test Accuracy: {test}%')
print(f'Accuracy of k-nn (k=5) by using AMI:{val1}%')
print("===========================================")

# find accuracy by NMI
NMI_dict = {}
NMI_name = []
feature_list = []
for i in range(shape):
    for j in range(i, shape):
        if i != j:
            NMI_dict[f"f{i}*f{j}"] = normalized_mutual_info_score(data.iloc[:,i].astype('float'), data.iloc[:,j].astype('float'))

NMI_dict = sorted(NMI_dict.items(),key = lambda x:x[1],reverse = True)
NMI_dict_most = NMI_dict[:4]

for i in NMI_dict_most:
    NMI_name.append(i[0])
for i in NMI_name:
    feature_list.append(new_features[i])
feature_list = pd.DataFrame(feature_list).transpose()

X_train, X_test, y_train, y_test = train_test_split(feature_list, y, train_size = 2/3, test_size=1/3, random_state = 100)
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)
y_test_pred = knn5.predict(X_test)
test = round(accuracy_score(y_test, y_test_pred)*100, 3)
y_train_pred = knn5.predict(X_train)
train = round(accuracy_score(y_train, y_train_pred)*100, 3)

val1 = round(knn5.score(X_test, y_test)*100,3)

print(f'Train Accuracy: {train}%')
print(f'Test Accuracy: {test}%')
print(f'Accuracy of k-nn (k=5) by using NMI:{val1}%')
print("===========================================")

# find accruacy by pca
pca = PCA(n_components=4)
X_reduced = pd.DataFrame(pca.fit_transform(new_features))

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, train_size = 2/3, test_size=1/3, random_state = 100)
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)
y_test_pred = knn5.predict(X_test)
test = round(accuracy_score(y_test, y_test_pred)*100, 3)
y_train_pred = knn5.predict(X_train)
train = round(accuracy_score(y_train, y_train_pred)*100, 3)

val2 = round(knn5.score(X_test, y_test)*100,3)

print(f'Train Accuracy: {train}%')
print(f'Test Accuracy: {test}%')
print(f'Accuracy of k-nn (k=5) by using pca:{val2}%')
print("===========================================")

# find accuracy by the first four columns
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 0:4], y, train_size = 2/3, test_size=1/3, random_state = 100)
knn5 = KNeighborsClassifier(n_neighbors=5)
knn5.fit(X_train, y_train)
y_test_pred = knn5.predict(X_test)
test = round(accuracy_score(y_test, y_test_pred)*100, 3)
y_train_pred = knn5.predict(X_train)
train = round(accuracy_score(y_train, y_train_pred)*100, 3)

val3 = round(knn5.score(X_test, y_test)*100, 3)

print(f'Train Accuracy: {train}%')
print(f'Test Accuracy: {test}%')
print(f'Accuracy of k-nn (k=5) by selecting the first four columns:{val3}%')

Train Accuracy(MI): 85.246%
Test Accuracy(MI): 78.689%
Accuracy of k-nn (k=5) by using MI:78.689%
Train Accuracy: 73.77%
Test Accuracy: 72.131%
Accuracy of k-nn (k=5) by using AMI:72.131%
Train Accuracy: 85.246%
Test Accuracy: 78.689%
Accuracy of k-nn (k=5) by using NMI:78.689%
Train Accuracy: 82.787%
Test Accuracy: 80.328%
Accuracy of k-nn (k=5) by using pca:80.328%
Train Accuracy: 81.967%
Test Accuracy: 75.41%
Accuracy of k-nn (k=5) by selecting the first four columns:75.41%
