In [43]:
import pandas as pd
import json
import csv
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


#----------------读取字典数据，将疾病对应的所有症状都展开成行----------------#
def GetAllSymptom(source_path, target_path):
    with open(source_path, 'rb') as f_source:
        load_dict = json.load(f_source)
        with open(target_path, 'w', encoding='utf8') as f_target:
            writer = csv.writer(f_target)
            for key, values in load_dict.items():
                for item in values:
                    writer.writerow([key,item])
    columns = ['disease', 'symptom']
    data = pd.read_csv(target_path, names=columns)
    data.to_csv(target_path,index=False)        

    
#----------------将所有的症状转换为类别，数据向量化-----------------#
def DataToVector(source_path, target_path, test_path):
    data = pd.read_csv(source_path)
    df = pd.DataFrame(data)
    test_data = pd.read_csv(test_path)
    test_df = pd.DataFrame(test_data)
    df = pd.concat([df,test_df], axis=0, ignore_index=True)
    #print(df)

    #将所有症状转换为类别
    df_vector = pd.get_dummies(df.symptom)
    #print(df_vector)    #3055
    df_disease = df['disease']
    #print(df_disease)
    #将两张表合并为一张，行对齐，列合并
    df_pivoted = pd.concat([df_disease,df_vector], axis=1)
    #print(df_pivoted)
    #删除合并后表中的重复行，在源数据上修改
    df_pivoted.drop_duplicates(keep='first',inplace=True)
    df_pivoted = df_pivoted.groupby('disease').sum()
    df_pivoted = df_pivoted.reset_index()
    #print(df_pivoted)
    df_pivoted.to_csv(target_path)
    
    
#----------------测试数据向量化----------------#
def TestDataToVector(source_path,train_path,target_path):
    data = pd.read_csv(source_path)
    df = pd.DataFrame(data)
    len_test = len(df)
    #print(len_test)
    #print(df)
    train_data = pd.read_csv(train_path)
    train_df = pd.DataFrame(train_data)
    
    df = pd.concat([df,train_df], axis=0, ignore_index=True)
    #print(df)

    #将所有症状转换为类别
    df_vector = pd.get_dummies(df.symptom)
    #print(df_vector)
    df_disease = df['disease']
    #print(df_disease)
    #将两张表合并为一张，行对齐，列合并
    df_pivoted = pd.concat([df_disease,df_vector], axis=1)
    #print(df_pivoted)
    #选取测试数据
    df_pivoted = df_pivoted[:len_test]
    #print(df_pivoted)
    df_pivoted.to_csv(target_path)
     
        
        
#--------------套入模型训练-----------------#
def ModelToAchieve(source_path):
    data = pd.read_csv(source_path)
    df = pd.DataFrame(data)
    #print(df)
    columns = df.columns
    columns = columns[2:]
    #print(columns)  3043
    x = df[columns]
    #print(x)
    y = df['disease']
    
        #---------------处理测试数据--------------#
    GetAllSymptom("./test/test_disease.json","./test/data_clean.csv")
    TestDataToVector("./test/data_clean.csv", "./data/data_clean.csv", "./test/data_pivoted.csv")
    test_data = pd.read_csv("./test/data_pivoted.csv")
    test_df = pd.DataFrame(test_data)
    
    test_columns = test_df.columns
    test_columns = test_columns[2:]
    #print(test_columns)
    test_x = test_df[test_columns]
    #print(test_x)
    test_y = test_df['disease']
    #train_x = pd.concat([x,test_x], axis=1)
    '''
    #划分数据
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=142)
    '''
    '''
    #决策树算法
    from sklearn.tree import DecisionTreeClassifier, export_graphviz
    dt = DecisionTreeClassifier()
    clf_dt=dt.fit(x,y)
    '''
    mnb_tot = MultinomialNB()
    mnb_tot = mnb_tot.fit(x, y)
    disease_pred = mnb_tot.predict(test_x)
    print(disease_pred)
    print ("Acurracy: ", mnb_tot.score(test_x,test_y))
if __name__ == '__main__':
    #GetAllSymptom("./data/result_disease.json","./data/data_clean.csv")
    DataToVector("./data/data_clean.csv","./data/data_pivoted.csv","./test/data_clean.csv")
    ModelToAchieve("./data/data_pivoted.csv")

['乳腺增生' '乳腺增生' '乳管内乳头状瘤' '乳腺结核' '直肠炎' '克罗恩病' '直肠炎' '直肠炎' '淋菌性肛门直肠炎'
 '急性阑尾炎' '急性化脓性乳腺炎' '急性阑尾炎' '鹦鹉热' '上皮样肉瘤' '脑结核瘤' '脑结核瘤' '皮脂腺囊肿' '皮样囊肿'
 '儿童口吃' '儿童口吃' '儿童口吃' '儿童口吃']
Acurracy:  0.6363636363636364
