In [11]:
!pip install jieba
!pip install openpyxl



In [12]:
import pandas as pd
import openpyxl
import numpy as np
import math
from csv import reader
import jieba
import jieba.analyse
import os
import itertools
import matplotlib.pyplot as plt
import zipfile

In [13]:

with zipfile.ZipFile('dataTrainComplete.zip', 'r') as zip_ref:
    zip_ref.extractall('data/')

with zipfile.ZipFile('dataPrivateComplete.zip', 'r') as zip_ref:
    zip_ref.extractall('data/')

In [14]:
filenames = [".".join(f.split(".")[:-1]) for f in os.listdir('data/dataTrainComplete') if os.path.isfile(os.path.join('data/dataTrainComplete', f))]
filenames[:10]

['733', '776', '1190', '654', '916', '1366', '276', '219', '273', '1357']

In [15]:
with open('data/TrainLabel.csv', 'r') as f:
    # pass the file object to reader() to get the reader object
    csv_reader = reader(f)
    # Get all rows of csv from csv_reader object as list of tuples
    doc_ref_tuples = list(map(tuple, csv_reader))[1:]
    # display all rows of csv
    print(doc_ref_tuples[:10])

[('3', '415'), ('3', '649'), ('9', '5'), ('25', '32'), ('25', '41'), ('26', '37'), ('27', '46'), ('29', '72'), ('32', '25'), ('32', '41')]


In [16]:
doc_pairs = list(itertools.permutations(filenames,2))

In [17]:
# data is highly imbalanced
print("number of files: ", len(filenames))
print("number of similar files: ", len(doc_ref_tuples))
print("number of all possible pairs: ", len(doc_pairs))
print("ratio similar/dissimilar pairs: ", len(doc_ref_tuples)/len(doc_pairs))

number of files:  560
number of similar files:  1383
number of all possible pairs:  313040
ratio similar/dissimilar pairs:  0.004417965755175057


In [18]:
texts = []
for i, file in enumerate(filenames):
        with open('data/dataTrainComplete/' + file + '.txt', 'r', encoding='UTF-8') as f:
            f = f.read()
            texts.append(f)

In [19]:
# prepare training labels: 1 -> if docs are similar, 0 -> not similar
y_train = []
for pair in doc_pairs:
    if pair in doc_ref_tuples:
        y_train.append(1)
    else:
        y_train.append(0)

In [8]:
features_file = open("data/features.txt", "r")
features = features_file.read().splitlines()
features[:3]

['甜菜夜蛾', '黑點病', '軟腐病']

## Tokenization

In [11]:
jieba.load_userdict('data/features.txt') 

In [12]:
tokens = []
for t in texts:
  tokens.append(jieba.lcut(t))

In [14]:
#tokens[0]

If you want to use Python NLP toolkit to analyze Traditional Chinese text, CKIP is your first choice. CKIP is developed by Taiwan Institute of Information Science, Academia Sinica, And won rankings in many competitions.

In [3]:
!pip3 install ckiptagger
!pip3 install tensorflow
!pip3 install gdown

Collecting ckiptagger
  Downloading ckiptagger-0.2.1-py3-none-any.whl (34 kB)
Installing collected packages: ckiptagger
Successfully installed ckiptagger-0.2.1


In [4]:
# -*- coding: utf-8 -*-
from ckiptagger import data_utils
data_utils.download_data_gdown("./")

Downloading...
From: https://drive.google.com/uc?id=1efHsY16pxK0lBD2gYCgCTnv1Swstq771
To: /content/data.zip
100%|██████████| 1.88G/1.88G [00:18<00:00, 102MB/s]


In [22]:
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
import os

In [6]:
#ws = WS("./data")
#pos = POS("./data")
#ner = NER("./data")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
ws = WS("./data", disable_cuda=False)
pos = POS("./data", disable_cuda=False)
ner = NER("./data", disable_cuda=False)

  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
  partitioner=maybe_partitioner)
  initializer=initializer)
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)
  cell = tf.compat.v1.nn.rnn_cell.LSTMCell(hidden_d, name=name)


In [27]:
feature_dct = {features[i]: 1 for i in range(len(features))}
feature_dict = construct_dictionary(feature_dct)
feature_dict[0]

(1,
 {' ': 1.0,
  '桃': 1.0,
  '桑': 1.0,
  '梅': 1.0,
  '梨': 1.0,
  '棗': 1.0,
  '茶': 1.0,
  '蔥': 1.0})

In [28]:
word_sentence_list = ws(
    texts,
    # sentence_segmentation = True, # To consider delimiters
    # segment_delimiter_set = {",", "。", ":", "?", "!", ";"}), # This is the defualt set of delimiters
    # recommend_dictionary = dictionary1, # words in this dictionary are encouraged
    coerce_dictionary = feature_dict # words in this dictionary are forced
)

In [59]:
special_chars = ['，', '\n', '(', ')','。','、','：','\n\n','~','.','；','\u3000', '～', '℃', '（','）']

In [63]:
clean_word_sentence_list = [[] for i in range(len(word_sentence_list))]

for i, tokens in enumerate(word_sentence_list):
  for t in tokens:
    if t not in special_chars and not t.isdigit() and "％" not in t and "%" not in t and "." not in t and "," not in t:
      clean_word_sentence_list[i].append(t)

In [71]:
chem = pd.read_excel('data/Keywords/02chem.list.xlsx', engine='openpyxl', header=None)
crop = pd.read_excel('data/Keywords/02crop.list.xlsx', engine='openpyxl', header=None)
pest = pd.read_excel('data/Keywords/02pest.list.xlsx', engine='openpyxl', header=None)

rows = len(chem)
columns = len(chem.columns)
chems_syn = [[] for i in range(rows)]
for r in range(rows):
    for c in range(columns):
        if type(chem.iloc[r,c]) == str:
            chems_syn[r].append(chem.iloc[r,c])


rows = len(crop)
columns = len(crop.columns)
crops_syn = [[] for i in range(rows)]
for r in range(rows):
    for c in range(columns):
        if type(crop.iloc[r,c]) == str:
            crops_syn[r].append(crop.iloc[r,c])


rows = len(pest)
columns = len(pest.columns)
pests_syn = [[] for i in range(rows)]
for r in range(rows):
    for c in range(columns):
        if type(pest.iloc[r,c]) == str:
            pests_syn[r].append(pest.iloc[r,c])

In [72]:
all_keywords = chems_syn+crops_syn+pests_syn

In [74]:
syn_clean_tokens = clean_word_sentence_list.copy()
# replace by synonym
for i, tokens in enumerate(clean_word_sentence_list):
  for j, t in enumerate(tokens):
    for kw in all_keywords:
      if t in kw:
        syn_clean_tokens[i][j] = kw[0] #always choose first synonym

metrics

In [335]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [336]:
def get_metrics(testy, yhat_classes):
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(testy, yhat_classes)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(testy, yhat_classes)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(testy, yhat_classes)
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(testy, yhat_classes)
    print('F1 score: %f' % f1)

# keyword set

In [77]:
kw_sets = [set() for i in range(len(syn_clean_tokens))]
for i, tokens in enumerate(syn_clean_tokens):
  for t in tokens:
    if t in features:
      kw_sets[i].add(t)

In [79]:
kw_sets[0]

{'柑桔', '白柚', '賽洛寧', '陶斯松'}

In [82]:
def jaccard_similarity(s1, s2):
    if len(s1.union(s2)) == 0: return 0
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [83]:
jaccard_scores = []
for (d1,d2) in doc_pairs:
  i1 = filenames.index(d1)
  i2 = filenames.index(d2)

  jaccard_scores.append(jaccard_similarity(kw_sets[i1], kw_sets[i2]))

In [87]:
sum(x == 0 for x in jaccard_scores)

211778

In [112]:
y_pred10 = [0]*len(doc_pairs)
#if pair has common keywords in all categories -> classify as similar
for i, pair in enumerate(doc_pairs):
  if jaccard_scores[i] >= 0.7:
    y_pred10[i] = 1
        
get_metrics(y_train, y_pred10)

Accuracy: 0.995895
Precision: 0.545037
Recall: 0.428778
F1 score: 0.479968


In [129]:
def sorensin_similarity(s1, s2):
    if (len(s1)+len(s1))==0 : return 0
    return float(2*len(s1.intersection(s2)) / (len(s1)+len(s1)))

In [130]:
sorensin_scores = []
for (d1,d2) in doc_pairs:
  i1 = filenames.index(d1)
  i2 = filenames.index(d2)

  sorensin_scores.append(sorensin_similarity(kw_sets[i1], kw_sets[i2]))

In [131]:
y_pred12 = [0]*len(doc_pairs)
#if pair has common keywords in all categories -> classify as similar
for i, pair in enumerate(doc_pairs):
  if sorensin_scores[i] >= 0.97: #0.609772
    y_pred12[i] = 1
        
get_metrics(y_train, y_pred12)

Accuracy: 0.996173
Precision: 0.554831
Recall: 0.676790
F1 score: 0.609772


In [132]:
def overlap_similarity(s1, s2):
    if len(s1) == 0 or len(s2)==0 : return 0
    return float(len(s1.intersection(s2)) / min(len(s1),len(s1)))

In [133]:
overlap_scores = []
for (d1,d2) in doc_pairs:
  i1 = filenames.index(d1)
  i2 = filenames.index(d2)

  overlap_scores.append(overlap_similarity(kw_sets[i1], kw_sets[i2]))

In [140]:
y_pred13 = [0]*len(doc_pairs)
#if pair has common keywords in all categories -> classify as similar
for i, pair in enumerate(doc_pairs):
  if overlap_scores[i] >= 0.97: #0.609772
    y_pred13[i] = 1
        
get_metrics(y_train, y_pred13)

Accuracy: 0.996173
Precision: 0.554831
Recall: 0.676790
F1 score: 0.609772


In [332]:
def twersky_similarity(s1, s2):
  a = 0.97
  b= 0.03
  if len(s1) == 0 or len(s2)==0 : return 0
  return float(len(s1.intersection(s2)) / (len(s1.intersection(s2))+a*len(s1.difference(s2))+b*len(s2.difference(s1))))

In [333]:
twersky_scores = []
for (d1,d2) in doc_pairs:
  i1 = filenames.index(d1)
  i2 = filenames.index(d2)

  twersky_scores.append(twersky_similarity(kw_sets[i1], kw_sets[i2]))

In [334]:
y_pred15 = [0]*len(doc_pairs)
#if pair has common keywords in all categories -> classify as similar
for i, pair in enumerate(doc_pairs):
  if twersky_scores[i] >= 0.92: #0.665923 0.92 0.97 for a
    y_pred15[i] = 1
        
get_metrics(y_train, y_pred15)  # Good score here but when uploading, we get 0.0037950

Accuracy: 0.997131
Precision: 0.685824
Recall: 0.647144
F1 score: 0.665923


## tf idf

In [337]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [98]:
doctoanalyze = []
for tokens in syn_clean_tokens:
  doc_with_space=''
  for token in tokens:
      doc_with_space += token
      doc_with_space += ' '
  doctoanalyze.append(doc_with_space)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(doctoanalyze)

cosinescore = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [114]:
y_pred11 = [0]*len(doc_pairs)
#if pair has common keywords in all categories -> classify as similar
for i, (d1,d2) in enumerate(doc_pairs):
  idx1 = filenames.index(d1)
  idx2 = filenames.index(d2)
  if cosinescore[idx1][idx2] >= 0.2 and jaccard_scores[i] >= 0.7:
    y_pred11[i] = 1
        
get_metrics(y_train, y_pred11)

Accuracy: 0.995914
Precision: 0.548327
Recall: 0.426609
F1 score: 0.479870


# test on private data

In [141]:
with zipfile.ZipFile('dataPrivateComplete.zip', 'r') as zip_ref:
    zip_ref.extractall('data/')

In [142]:
filenames_test = [".".join(f.split(".")[:-1]) for f in os.listdir('data/dataPrivateComplete') if os.path.isfile(os.path.join('data/dataPrivateComplete', f))]
filenames_test[:10]

['768', '1171', '166', '322', '1224', '708', '674', '976', '1211', '1003']

In [143]:
doc_pairs_test = list(itertools.permutations(filenames_test,2))


In [144]:
texts_test = []
for i, file in enumerate(filenames_test):
        with open('data/dataPrivateComplete/' + file + '.txt', 'r', encoding='UTF-8') as f:
            f = f.read()
            texts_test.append(f)

In [145]:
word_sentence_list_test = ws(
    texts_test,
    # sentence_segmentation = True, # To consider delimiters
    # segment_delimiter_set = {",", "。", ":", "?", "!", ";"}), # This is the defualt set of delimiters
    # recommend_dictionary = dictionary1, # words in this dictionary are encouraged
    coerce_dictionary = feature_dict # words in this dictionary are forced
)

In [146]:
clean_word_sentence_list_test = [[] for i in range(len(word_sentence_list_test))]

for i, tokens in enumerate(word_sentence_list_test):
  for t in tokens:
    if t not in special_chars and not t.isdigit() and "％" not in t and "%" not in t and "." not in t and "," not in t:
      clean_word_sentence_list_test[i].append(t)

In [147]:
syn_clean_tokens_test = clean_word_sentence_list_test.copy()
# replace by synonym
for i, tokens in enumerate(clean_word_sentence_list_test):
  for j, t in enumerate(tokens):
    for kw in all_keywords:
      if t in kw:
        syn_clean_tokens_test[i][j] = kw[0] #always choose first synonym

In [148]:
kw_sets_test = [set() for i in range(len(syn_clean_tokens_test))]
for i, tokens in enumerate(syn_clean_tokens_test):
  for t in tokens:
    if t in features:
      kw_sets_test[i].add(t)

In [278]:
sorensin_scores_test = []
#twersky_scores_test = []
for (d1,d2) in doc_pairs_test:
  i1 = filenames_test.index(d1)
  i2 = filenames_test.index(d2)

  sorensin_scores_test.append(sorensin_similarity(kw_sets_test[i1], kw_sets_test[i2]))

In [279]:
y_pred14 = [0]*len(doc_pairs_test)
#if pair has common keywords in all categories -> classify as similar
for i, pair in enumerate(doc_pairs_test):
  if sorensin_scores_test[i] >= 0.97: #0.609772 #twersky_scores[i] >= 0.92:
    y_pred14[i] = 1

In [280]:
res_test = []

#y_pred_nb_test = nb.predict(common_kw_list_test)

for i, pair in enumerate(doc_pairs_test):
  if y_pred14[i] == 1: #clf_under.predict([X_test[i]]) == 1:
      res_test.append(pair)

In [277]:
test_gen = [['Test','Reference']] + res_test
            
import csv

with open('data/test_result.csv', 'w', encoding='UTF-8', newline='') as f:
    writer = csv.writer(f)
    for pair in test_gen:
        writer.writerow(pair)