In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression,LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score , confusion_matrix, precision_score, recall_score, f1_score

In [2]:
raw_df = pd.read_csv("/Users/apple/Desktop/deep_eye/dataset/QCP_sequence/AC_totalpq_QCP_sequence.csv")
x_raw_df = raw_df.iloc[:,5:]
y_raw_df = raw_df.iloc[:,4]

In [3]:
x_raw_list = [[y for y in x if pd.notna(y)] for x in x_raw_df.values.tolist()]
x_raw_list

[['Q',
  'Q',
  'Q',
  'QQ',
  'C',
  'C',
  'CCC',
  'C',
  'C',
  'C',
  'Q',
  'Q',
  'Q',
  'Q',
  'Q',
  'Q',
  'C',
  'C',
  'Q',
  'QQ',
  'QQ',
  'Q',
  'CC',
  'C',
  'C',
  'CC',
  'Q',
  'Q',
  'C',
  'CC',
  'CC',
  'CC',
  'CC',
  'C',
  'CCC',
  'C',
  'CCC'],
 ['Q',
  'Q',
  'QQ',
  'QQ',
  'Q',
  'C',
  'Q',
  'Q',
  'Q',
  'QQQ',
  'P',
  'PP',
  'Q',
  'Q',
  'QQ',
  'Q',
  'Q',
  'C',
  'C',
  'CC',
  'C',
  'C',
  'C',
  'C',
  'CC',
  'CCCCCCC',
  'CC',
  'CC',
  'C',
  'CC',
  'CC',
  'C',
  'C',
  'C',
  'CC',
  'C',
  'C',
  'CC',
  'C',
  'C',
  'C',
  'C',
  'CC',
  'QQQQQQ'],
 ['P',
  'Q',
  'Q',
  'QQ',
  'Q',
  'Q',
  'QQ',
  'C',
  'Q',
  'Q',
  'QQ',
  'Q',
  'P',
  'P',
  'PPPP',
  'P',
  'C',
  'CC',
  'CC',
  'C',
  'Q',
  'Q',
  'Q',
  'Q',
  'QQ',
  'Q',
  'Q',
  'Q',
  'P',
  'PP',
  'P',
  'PPPP',
  'Q',
  'C',
  'C',
  'C',
  'C',
  'Q',
  'QQQ',
  'Q',
  'Q',
  'Q',
  'QQQ',
  'PP',
  'QQ',
  'QQ',
  'P',
  'PP',
  'C',
  'C'],
 ['P',
  'Q',
  'Q

In [4]:
# turn x_raw_df categorical data to number
word_to_index = {}
for sentence in x_raw_list:
    for word in sentence:
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)

word_to_index

{'Q': 0,
 'QQ': 1,
 'C': 2,
 'CCC': 3,
 'CC': 4,
 'QQQ': 5,
 'P': 6,
 'PP': 7,
 'CCCCCCC': 8,
 'QQQQQQ': 9,
 'PPPP': 10,
 'CCCC': 11,
 'PPP': 12,
 'PPPPP': 13,
 'PPPPPP': 14,
 'CCCCCCCCCC': 15,
 'CCCCC': 16,
 'QQQQ': 17,
 'PPPPPPPP': 18,
 'CCCCCC': 19,
 'QQQQQ': 20,
 'QQQQQQQQ': 21,
 'PPPPPPPPPP': 22,
 'QQQQQQQ': 23,
 'CCCCCCCCC': 24,
 'CCCCCCCCCCCC': 25,
 'CCCCCCCC': 26,
 'PPPPPPPPPPP': 27,
 'QQQQQQQQQQQQQQQQQ': 28,
 'PPPPPPP': 29,
 'QQQQQQQQQ': 30,
 'QQQQQQQQQQQ': 31,
 'PPPPPPPPP': 32,
 'QQQQQQQQQQQQ': 33,
 'CCCCCCCCCCC': 34,
 'PPPPPPPPPPPPPPPP': 35,
 'PPPPPPPPPPPP': 36,
 'QQQQQQQQQQ': 37,
 'CCCCCCCCCCCCCCCCC': 38,
 'PPPPPPPPPPPPPPPPPPPP': 39,
 'CCCCCCCCCCCCC': 40,
 'QQQQQQQQQQQQQQ': 41,
 'PPPPPPPPPPPPPPPPP': 42,
 'PPPPPPPPPPPPPPP': 43,
 'PPPPPPPPPPPPPP': 44,
 'PPPPPPPPPPPPPPPPPPPPPPPPP': 45,
 'PPPPPPPPPPPPPPPPPP': 46,
 'PPPPPPPPPPPPPPPPPPPPPPPPPP': 47,
 'PPPPPPPPPPPPP': 48,
 'QQQQQQQQQQQQQ': 49,
 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCC': 50,
 'CCCCCCCCCCCCCCCC': 51,
 'QQQQQQQQQQQQQQQ': 52,
 

In [5]:

categ_index_list = []
for seq in x_raw_list:
    categ_to_index = [word_to_index[w] for w in seq]
    categ_index_list.append(categ_to_index)
categ_index_list

[[0,
  0,
  0,
  1,
  2,
  2,
  3,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  2,
  0,
  1,
  1,
  0,
  4,
  2,
  2,
  4,
  0,
  0,
  2,
  4,
  4,
  4,
  4,
  2,
  3,
  2,
  3],
 [0,
  0,
  1,
  1,
  0,
  2,
  0,
  0,
  0,
  5,
  6,
  7,
  0,
  0,
  1,
  0,
  0,
  2,
  2,
  4,
  2,
  2,
  2,
  2,
  4,
  8,
  4,
  4,
  2,
  4,
  4,
  2,
  2,
  2,
  4,
  2,
  2,
  4,
  2,
  2,
  2,
  2,
  4,
  9],
 [6,
  0,
  0,
  1,
  0,
  0,
  1,
  2,
  0,
  0,
  1,
  0,
  6,
  6,
  10,
  6,
  2,
  4,
  4,
  2,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  6,
  7,
  6,
  10,
  0,
  2,
  2,
  2,
  2,
  0,
  5,
  0,
  0,
  0,
  5,
  7,
  1,
  1,
  6,
  7,
  2,
  2],
 [6,
  0,
  0,
  0,
  7,
  0,
  5,
  0,
  1,
  0,
  0,
  0,
  4,
  2,
  2,
  4,
  2,
  2,
  6,
  11,
  3,
  11,
  4,
  2,
  2,
  0,
  6,
  2,
  3],
 [0,
  2,
  3,
  1,
  0,
  0,
  0,
  0,
  0,
  6,
  6,
  7,
  6,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  6,
  6,
  1,
  0,
  2,
  2,
  2,
  2,
  4,
  12,
  3,
  2,
  2,
  2,
  2,
  2,
 

# all sequence

In [13]:
x_df = pd.DataFrame(categ_index_list).iloc[:,10:30]
xy_df = pd.concat([x_df,y_raw_df],axis = 1)
df = xy_df.dropna()
x_df = df.iloc[:,:-1]
y_df = df.iloc[:,-1]
print(df.shape)
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, random_state = 404)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1453, 21)
(1089, 20)
(364, 20)
(1089,)
(364,)


In [14]:
# warnings.filterwarnings('ignore')
# Baseline - comparing model accuracy using all features across classifiers 
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    SVC(kernel="linear"),
    GaussianNB(),
    LogisticRegression()
    ]


# Train Accuracy
algo_train = []
scores_train = []
for clf in classifiers:
    algo_train.append(clf.__class__.__name__)
    scores_train.append(cross_val_score(clf, x_df, y_df.astype('int'), cv=5).mean())
# warnings.filterwarnings('ignore')
score_df_Train = pd.DataFrame({'Algorithm': algo_train, 'Score': scores_train}).set_index('Algorithm')


# Test Accuracy
algo_test = []
scores_test = []

for clf in classifiers:
    clf = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    algo_test.append(clf.__class__.__name__)
    scores_test.append(accuracy_score(y_test, y_pred))
# warnings.filterwarnings('ignore')
score_df_Test  = pd.DataFrame({'Algorithm': algo_test, 'Score': scores_test}).set_index('Algorithm')

# # Bar plot between Train and Test Accuracy
# fig = plt.figure(figsize=(10,7)) # Create matplotlib figure
# ax = fig.add_subplot(111) # Create matplotlib axes
# ax2 = ax.twinx() # Create another axes that shares the same x-axis as a
# width = .4

# score_df_Train.Score.plot(kind='bar',color='green',ax=ax,width=width, position=0)
# score_df_Test.Score.plot(kind='bar',color='red', ax=ax2,width = width,position=1)
# ax.grid(None, axis='both')
# ax2.grid(None)
# ax.set_ylabel('Train')
# ax2.set_ylabel('Test')
# ax.set_xlim(-1,7)

# scores_train_round = np.round(scores_train,3)
# scores_test_round = np.round(scores_test,3)
# for index,data in enumerate(scores_train_round):
#     plt.text(x=index-.4 , y =data+.02 , s=f"{data}" , fontdict=dict(fontsize=10), color='red', fontweight='bold')
# for index,data in enumerate(scores_test_round):
#     plt.text(x=index , y =data+.03 , s=f"{data}" , fontdict=dict(fontsize=10), color='green', fontweight='bold')

# plt.show()

# Table
score_df_Test_table  = pd.DataFrame({'Algorithm': algo_test, 'Score': scores_test})
result_df = score_df_Test_table.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df


Unnamed: 0_level_0,Algorithm
Score,Unnamed: 1_level_1
0.60989,SVC
0.601648,RandomForestClassifier
0.601648,LogisticRegression
0.593407,GaussianNB
0.521978,DecisionTreeClassifier
0.516484,KNeighborsClassifier


# XGboost
https://ithelp.ithome.com.tw/articles/10268984

In [15]:
#from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

def score(m, x_train, y_train, x_test, y_test, train=True):
    if train:
        pred=m.predict(x_train)
        print('Train Result:\n')
        print(f"Accuracy Score: {accuracy_score(y_train, pred)*100:.2f}%")
        print(f"Precision Score: {precision_score(y_train, pred)*100:.2f}%")
        print(f"Recall Score: {recall_score(y_train, pred)*100:.2f}%")
        print(f"F1 score: {f1_score(y_train, pred)*100:.2f}%")
        print(f"Confusion Matrix:\n {confusion_matrix(y_train, pred)}")
    elif train == False:
        pred=m.predict(x_test)
        print('Test Result:\n')
        print(f"Accuracy Score: {accuracy_score(y_test, pred)*100:.2f}%")
        print(f"Precision Score: {precision_score(y_test, pred)*100:.2f}%")
        print(f"Recall Score: {recall_score(y_test, pred)*100:.2f}%")
        print(f"F1 score: {f1_score(y_test, pred)*100:.2f}%")
        print(f"Confusion Matrix:\n {confusion_matrix(y_test, pred)}")


In [16]:
from xgboost import XGBClassifier
xg1 = XGBClassifier()
xg1=xg1.fit(x_train, y_train)

score(xg1, x_train, y_train, x_test, y_test, train=False)



Test Result:

Accuracy Score: 51.37%
Precision Score: 37.23%
Recall Score: 35.92%
F1 score: 36.56%
Confusion Matrix:
 [[136  86]
 [ 91  51]]


## tuning RandomizedSearchCV

In [17]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
learning_rate=[round(float(x),2) for x in np.linspace(start=0.01, stop=0.2, num=10)]
colsample_bytree =[round(float(x),2) for x in np.linspace(start=0.1, stop=1, num=10)]

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'learning_rate': learning_rate,
               'colsample_bytree': colsample_bytree}
random_grid 

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.09, 0.12, 0.14, 0.16, 0.18, 0.2],
 'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}

In [18]:
xg4 = XGBClassifier(random_state=42)

#Random search of parameters, using 3 fold cross validation, search across 100 different combinations, and use all available cores
xg_random = RandomizedSearchCV(estimator = xg4, param_distributions=random_grid,
                              n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

xg_random.fit(x_train,y_train)
xg_random.best_params_
# 9min 24sec
# total size:(1453, 21)
# train size:(1089, 20)
# test size:(364, 20)

Fitting 3 folds for each of 100 candidates, totalling 300 fits






{'n_estimators': 400,
 'max_depth': 20,
 'learning_rate': 0.05,
 'colsample_bytree': 0.5}

In [19]:
xg5 = XGBClassifier(colsample_bytree= 0.5, learning_rate=0.05, max_depth= 20, n_estimators=400)
xg5=xg5.fit(x_train, y_train)
score(xg5, x_train, y_train, x_test, y_test, train=False)

Test Result:

Accuracy Score: 53.85%
Precision Score: 38.60%
Recall Score: 30.99%
F1 score: 34.38%
Confusion Matrix:
 [[152  70]
 [ 98  44]]


---
# recommendation system

## TF-IDF
https://medium.com/qiubingcheng/%E4%BB%A5python%E5%AF%A6%E4%BD%9Ctf-idf%E7%AE%97%E6%B3%95-%E4%B8%A6%E4%BB%A5%E6%96%87%E5%AD%97%E9%9B%B2%E5%91%88%E7%8F%BE-7c6698b42025
1. 詞頻(term frequency，tf)
   1. 某一個給定的詞語在該文本中出現的頻率
   2. 該語詞在文本的出現次數/文本中所有語詞的出現次數之合
   3. 防止偏向較長的文本
2. 逆向文件頻率(inverse document frequency，idf)
   1. 一個詞語普遍重要程度
   2. log(文本的數量/包含該語詞的文本數量)
   3. 如果該詞在其他文本很少見到，表示越關鍵

In [81]:
# 統計 每個 語詞 的 次數
words_count = [] 
for file in x_raw_list:
  count = {}
  for word in file:
     if word in count:
        count[word] += 1
     else:
        count[word] = 1
  words_count.append(count)

In [106]:
ttt = words_count[0]
ttt["Q"] = 5
ttt
sum(ttt.values())

29

In [110]:
# 統計 每個 語詞 的 frequency
words_frequency = [] 
for dic in words_count:
   frequency = {}
   for word in dic:
      frequency[word] = dic[word]/sum(dic.values())
   words_frequency.append(frequency)

words_frequency

[{'Q': 0.1724137931034483,
  'QQ': 0.10344827586206896,
  'C': 0.41379310344827586,
  'CCC': 0.10344827586206896,
  'CC': 0.20689655172413793},
 {'Q': 0.22727272727272727,
  'QQ': 0.06818181818181818,
  'C': 0.38636363636363635,
  'QQQ': 0.022727272727272728,
  'P': 0.022727272727272728,
  'PP': 0.022727272727272728,
  'CC': 0.20454545454545456,
  'CCCCCCC': 0.022727272727272728,
  'QQQQQQ': 0.022727272727272728},
 {'P': 0.14,
  'Q': 0.38,
  'QQ': 0.12,
  'C': 0.18,
  'PPPP': 0.04,
  'CC': 0.04,
  'PP': 0.06,
  'QQQ': 0.04},
 {'P': 0.10344827586206896,
  'Q': 0.3103448275862069,
  'PP': 0.034482758620689655,
  'QQQ': 0.034482758620689655,
  'QQ': 0.034482758620689655,
  'CC': 0.10344827586206896,
  'C': 0.2413793103448276,
  'CCCC': 0.06896551724137931,
  'CCC': 0.06896551724137931},
 {'Q': 0.35,
  'C': 0.275,
  'CCC': 0.075,
  'QQ': 0.1,
  'P': 0.125,
  'PP': 0.025,
  'CC': 0.025,
  'PPP': 0.025},
 {'Q': 0.3157894736842105,
  'QQ': 0.05263157894736842,
  'PP': 0.15789473684210525,
  '

In [77]:
#idf
# 先 取得 每個 語詞 在 歌手 歌詞 中 " 出現 過 " 的 次數
all_words = []
for word in words_count:
    all_words.extend(list(word.keys()))

occurrences_of_word = {}
for word in all_words:
    if word in occurrences_of_word:
        occurrences_of_word[word] += 1
    else:
        occurrences_of_word[word] = 1 

inverse_document_frequency = []
for word_count in words_count:
    # 出現 過 的 次數
    invFre = {} 
    for word in word_count.keys():
        occurrences = occurrences_of_word[word] 
        invFre[word] = math.log(round((len(words_count)/occurrences),4))
    inverse_document_frequency.append(invFre)

In [111]:
##tf*idf
all_tf_idf = []
for i,words in enumerate(words_frequency):
    tf_idf = {}
    for word, freq in words.items():
        tf_idf[word] = freq*inverse_document_frequency[i][word] 
    all_tf_idf.append(tf_idf)

In [112]:
all_tf_idf

[{'Q': 0.012035495559303956,
  'QQ': 0.012514957919265796,
  'C': 0.020661669826845076,
  'CCC': 0.05783807275147039,
  'CC': 0.0206014768750536},
 {'Q': 0.015864971419082486,
  'QQ': 0.008248494992243364,
  'C': 0.019292051561807994,
  'QQQ': 0.01339033159493107,
  'P': 0.003838603101404861,
  'PP': 0.0042890391886180115,
  'CC': 0.020367369183291628,
  'CCCCCCC': 0.07668565248350537,
  'QQQQQQ': 0.07842518002771719},
 {'P': 0.023645795104653946,
  'Q': 0.026526232212705916,
  'QQ': 0.014517351186348323,
  'C': 0.008987826374677607,
  'PPPP': 0.04427776595150583,
  'CC': 0.003982952195843696,
  'PP': 0.01132306345795155,
  'QQQ': 0.023566983607078682},
 {'P': 0.017472262392601436,
  'Q': 0.02166389200674712,
  'PP': 0.006507507734454914,
  'QQQ': 0.020316365178516104,
  'QQ': 0.004171652639755265,
  'CC': 0.0103007384375268,
  'C': 0.012052640732326293,
  'CCCC': 0.09655852704112329,
  'CCC': 0.038558715167646924},
 {'Q': 0.024432055985387027,
  'C': 0.013731401405757456,
  'CCC': 0.0

In [122]:
max_dic = {}
for dic in all_tf_idf:
    max_dic[max(dic)] = dic[max(dic)]
max(max_dic)

'QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ'

0.012035495559303956

In [123]:
# 統計 每個 語詞 的 frequency
words_frequency = [] 
for dic in words_count:
   frequency = {}
   for word in dic:
      frequency[word] = dic[word]/sum(dic.values())
   words_frequency.append(frequency)

words_frequency

[{'Q': 0.1724137931034483,
  'QQ': 0.10344827586206896,
  'C': 0.41379310344827586,
  'CCC': 0.10344827586206896,
  'CC': 0.20689655172413793},
 {'Q': 0.22727272727272727,
  'QQ': 0.06818181818181818,
  'C': 0.38636363636363635,
  'QQQ': 0.022727272727272728,
  'P': 0.022727272727272728,
  'PP': 0.022727272727272728,
  'CC': 0.20454545454545456,
  'CCCCCCC': 0.022727272727272728,
  'QQQQQQ': 0.022727272727272728},
 {'P': 0.14,
  'Q': 0.38,
  'QQ': 0.12,
  'C': 0.18,
  'PPPP': 0.04,
  'CC': 0.04,
  'PP': 0.06,
  'QQQ': 0.04},
 {'P': 0.10344827586206896,
  'Q': 0.3103448275862069,
  'PP': 0.034482758620689655,
  'QQQ': 0.034482758620689655,
  'QQ': 0.034482758620689655,
  'CC': 0.10344827586206896,
  'C': 0.2413793103448276,
  'CCCC': 0.06896551724137931,
  'CCC': 0.06896551724137931},
 {'Q': 0.35,
  'C': 0.275,
  'CCC': 0.075,
  'QQ': 0.1,
  'P': 0.125,
  'PP': 0.025,
  'CC': 0.025,
  'PPP': 0.025},
 {'Q': 0.3157894736842105,
  'QQ': 0.05263157894736842,
  'PP': 0.15789473684210525,
  '

In [124]:
max(words_frequency)

TypeError: '>' not supported between instances of 'dict' and 'dict'