# week6_hw PCA and K-means

In [1]:
import jieba
import os
import pandas as pd
import func
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import timedelta, date
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
import matplotlib.pyplot as plt
import plotly
# Plotly credentials are taken from the environment so that no secret is stored
# in the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel; a missing variable raises KeyError naming the variable.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)
import plotly.plotly as py
import plotly.graph_objs as go

In [3]:
def is_chinese(uchar):
    """Return True when ``uchar`` compares between U+4E00 and U+9FFF inclusive.

    The check is a plain string comparison against the two bounds, so callers
    are expected to pass a single character.
    """
    lower_bound, upper_bound = u'\u4e00', u'\u9fff'
    return lower_bound <= uchar <= upper_bound

In [4]:
# Proper nouns: register the custom dictionary file with jieba before any text is segmented
jieba.load_userdict('dict.txt')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 1.049 seconds.
Prefix dict has been built succesfully.


In [5]:
# Sub-folders of KAM/ to analyse. Each entry keeps its trailing slash because
# the file paths below are built by plain string concatenation.
folders = ["金融業/"]

In [6]:
def get_txts(folder):
    """List the file names directly inside ``KAM/<folder>`` in sorted order.

    Args:
        folder: Sub-folder name relative to ``KAM/`` (resolved against the
            current working directory), including its trailing slash.

    Returns:
        A sorted list of file names (not full paths). Sub-directories are
        skipped, since every returned name is later opened as a text file.

    Raises:
        OSError: If ``KAM/<folder>`` cannot be listed.
    """
    base_dir = "KAM/" + folder
    # os.listdir gives no ordering guarantee; sorting keeps the mapping from
    # document index to file name reproducible across runs and machines.
    return sorted(
        name
        for name in os.listdir(base_dir)
        if os.path.isfile(os.path.join(base_dir, name))
    )

In [7]:
# Read one text file and segment it into words with jieba
def textMining(folder, file):
    """Segment one file under ``./KAM/`` and keep only the all-Chinese tokens.

    Args:
        folder: Sub-folder name relative to ``./KAM/``, including its trailing slash.
        file: File name inside that folder; it is read as UTF-8 text.

    Returns:
        A list of tokens, in document order, for which every character passes
        ``is_chinese``. Tokens containing any other character (digits, Latin
        letters, punctuation, whitespace) are dropped.
    """
    with open("./KAM/" + folder + file, 'rt', encoding="utf-8") as txt:
        data = txt.read()
    # text segmentation, then filter: a token survives only if all its characters pass
    return [
        word
        for word in jieba.cut(data, cut_all=False)
        if all(is_chinese(ch) for ch in word)
    ]

In [8]:
def analyze(folder, corpus, txts_in_folder, weight_threshold=0.2):
    """Compute TF-IDF for ``corpus``, print the heavy terms per document, and project with PCA.

    Args:
        folder: Unused; kept so existing callers keep working.
        corpus: List of documents, each a string of space-separated tokens.
        txts_in_folder: File names aligned with ``corpus`` (same order); used
            only to label the printed output.
        weight_threshold: A term is printed for a document when its TF-IDF
            weight is strictly greater than this value. Defaults to 0.2.

    Returns:
        Array with one row per document and two columns: the first two
        principal components of the standardised TF-IDF matrix.
    """
    # tfidf
    # NOTE(review): TfidfVectorizer is used with its default tokenizer; confirm
    # that its default token pattern keeps the tokens you expect from jieba.
    vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 0.2)
    tfidf = vectorizer.fit_transform(corpus)
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    print("tfidf.shape: ", tfidf.shape)

    # Densify once and iterate over rows, instead of indexing the sparse
    # matrix element by element.
    X = tfidf.toarray()
    for i, row in enumerate(X):
        print('----{0} KAM----'.format(txts_in_folder[i]))
        for j, weight in enumerate(row):
            if weight > weight_threshold:
                print(words[j], weight, i, j)

    # Standardise each term column before PCA.
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = sklearnPCA(n_components = 2)
    return sklearn_pca.fit_transform(X_std)

In [9]:
def plot_cpa_scatter(Y_sklearn, txts_in_folder):
    """Plot the 2-D PCA projection as a scatter chart, one marker per document.

    Args:
        Y_sklearn: 2-D array; column 0 is plotted on x and column 1 on y.
        txts_in_folder: Hover labels, aligned with the rows of ``Y_sklearn``.

    Returns:
        Whatever ``py.iplot`` returns for the figure.

    NOTE(review): ``py`` is ``plotly.plotly`` as imported at the top of the
    notebook; confirm the installed plotly version still ships that module.
    """
    trace = go.Scatter(
        x = Y_sklearn[:,0],
        y = Y_sklearn[:,1],
        mode = "markers",
        hoverinfo = 'text',
        text = txts_in_folder
    )
    # Title and axis labels so the figure stands on its own.
    layout = go.Layout(
        title = 'PCA Scatter Chart',
        xaxis = dict(title = 'Principal component 1'),
        yaxis = dict(title = 'Principal component 2')
    )

    fig = go.Figure(data = [trace], layout = layout)
    return py.iplot(fig, filename = 'PCA Scatter Chart')

In [10]:
def plot_kmeans_scatter(Y_sklearn, txts_in_folder, random_state=0):
    """Cluster the PCA projection into three groups with K-means and plot it.

    Args:
        Y_sklearn: 2-D array; column 0 is plotted on x and column 1 on y.
            K-means is fitted on this array.
        txts_in_folder: Hover labels, aligned with the rows of ``Y_sklearn``.
        random_state: Seed passed to ``KMeans`` so that cluster assignments,
            and therefore marker colours, are the same on every run.

    Returns:
        Whatever ``py.iplot`` returns for the figure.

    NOTE(review): ``py`` is ``plotly.plotly`` as imported at the top of the
    notebook; confirm the installed plotly version still ships that module.
    """
    # n_init is set explicitly because its default differs between
    # scikit-learn releases.
    kmeans = KMeans(n_clusters = 3, n_init = 10, random_state = random_state)
    X_clustered = kmeans.fit_predict(Y_sklearn)

    # One colour per cluster label; must cover every label KMeans can emit.
    LABEL_COLOR_MAP = {0:'red', 1: 'green', 2: 'blue'}
    label_color = [LABEL_COLOR_MAP[l] for l in X_clustered]

    trace = go.Scatter(
        x = Y_sklearn[:,0],
        y = Y_sklearn[:,1],
        mode = "markers",
        hoverinfo = 'text',
        text = txts_in_folder,
        marker = dict(color = label_color)
    )
    # Title and axis labels so the figure stands on its own.
    layout = go.Layout(
        title = 'K-means Scatter Chart',
        xaxis = dict(title = 'Principal component 1'),
        yaxis = dict(title = 'Principal component 2')
    )

    fig = go.Figure(data = [trace], layout = layout)
    return py.iplot(fig, filename = 'K-means Scatter Chart')

### 分析金融業資料夾

In [11]:
# Build the corpus from folders[0]: one space-joined string of kept tokens per file
txts_in_folder1 = get_txts(folders[0])
corpus1 = []

for file_name in txts_in_folder1:
    filter_list = textMining(folders[0], file_name)
    join_list = " ".join(filter_list)
    corpus1.append(join_list)

# Prints the per-file TF-IDF terms and keeps the returned PCA projection for the plots
Y_sklearn1 = analyze(folders[0], corpus1, txts_in_folder1)

tfidf.shape:  (134, 197)
----1052801_KAM.txt KAM----
主要 0.2389888164698055 0 5
應收款 0.3348157504550433 0 83
放款 0.4713003605164304 0 102
正確性 0.2548511326610237 0 123
----1052807_KAM.txt KAM----
公司 0.31386754756753027 1 30
評價 0.3319434973597434 1 162
風險 0.48918204347614125 1 194
----1052809_KAM.txt KAM----
及其 0.23206419837259115 2 50
商業 0.217726943306282 2 59
放款 0.22229691913650554 2 102
評價 0.2866043518242076 2 162
資產 0.20825158962883802 2 171
銀行 0.27312093233977136 2 189
----1052812_KAM.txt KAM----
及其 0.347674402346741 3 50
商業 0.39143350203769295 3 59
子公司 0.28912736919023685 3 73
授信 0.2641852863236663 3 89
放款 0.2561855696242036 3 102
現及 0.23523737986075058 3 138
貼現 0.312936306225151 3 169
銀行 0.3507298814682406 3 189
----1052834_KAM.txt KAM----
報告 0.2901801200059702 4 68
放款 0.4345248203634508 4 102
減損 0.3024753059325667 4 130
組合 0.27939688409406294 4 149
負債 0.2209999614127099 4 168
銀行 0.2002015331399415 4 189
----1052836_KAM.txt KAM----
及其 0.2465306495691111 5 50
合併 0.21049645780417686 5 

評價 0.5635258787990123 60 162
----105證券康和證.txt KAM----
仟元 0.22937402651374109 61 13
分別 0.4390532914565438 61 36
合併 0.2623577177313719 61 55
報告 0.3369624788351609 61 68
收入 0.39080697843027623 61 99
精算 0.2270725211312387 61 148
計算 0.21469702135916413 61 161
----105證券權益期.txt KAM----
報告 0.35903669214205103 62 68
收入 0.7634156074555328 62 99
認列 0.24253325201426504 62 163
----105證券福邦證.txt KAM----
交易 0.3468454086685109 63 12
參閱 0.2025791960296163 63 49
合併 0.3583765561816922 63 55
報表 0.4602260667679875 63 70
樣本 0.22850153062142678 63 120
計算 0.2346182298046845 63 161
----105證券統一證.txt KAM----
可回收 0.2654874072632666 64 52
合併 0.2194509953927004 64 55
商譽 0.2789556248154826 64 60
報表 0.20129877900730062 64 70
投資 0.2659110153391548 64 85
折現 0.27202333960909164 64 86
現金流量 0.22126081263627848 64 139
金額 0.20821459694327846 64 187
----105證券群益證.txt KAM----
價值 0.2306428734752171 65 26
公允 0.2910577230372909 65 29
合併 0.2051068705827592 65 55
商譽 0.34762941120393803 65 60
報告 0.29270180396494727 65 68
負債 0.2860812

測試 0.21218864280497832 117 133
準備 0.4417555214532971 117 134
精算 0.31863476748983877 117 148
----106保險新產.txt KAM----
保險 0.4506631799736838 118 21
公允 0.2544222333424146 118 29
準備 0.30205031051406206 118 134
精算 0.45518537149228044 118 148
負債 0.2917508556562282 118 168
金融工具 0.2176580539810347 118 186
----106保險旺旺保.txt KAM----
保險 0.5755422593440428 119 21
合併 0.25186856459786267 119 55
報告 0.2875472011654911 119 68
投資 0.20346114364282633 119 85
負債 0.4311458972790035 119 168
----106保險泰安產險.txt KAM----
保險 0.43001542528848224 120 21
收入 0.46719479949754034 120 99
準備 0.32938452486668235 120 134
認列 0.21769060039061383 120 163
賠款 0.4983761138804319 120 173
----106保險第一保.txt KAM----
保險 0.2850004788436348 121 21
準備 0.4297890212750617 121 134
精算 0.2878603236356986 121 148
賠款 0.46449528490888775 121 173
----106保險華南保.txt KAM----
保險 0.46120240472965157 122 21
收入 0.38972750648713644 122 99
準備 0.3532732223439068 122 134
認列 0.23347866723580504 122 163
賠款 0.4677058486305807 122 173
----106保險遠壽.txt KAM----
保險 0.5

In [12]:
# Scatter chart of the two PCA components, one marker per file
plot_cpa_scatter(Y_sklearn1, txts_in_folder1)

In [13]:
# Same scatter chart, with markers coloured by K-means cluster
plot_kmeans_scatter(Y_sklearn1, txts_in_folder1)