In [1]:
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score
from data_process import DataProcess
from data_process import GetUsefulLevel
import os
from imblearn.over_sampling import SMOTE

  from pandas import MultiIndex, Int64Index
  from pandas import Int64Index as NumericIndex


In [2]:
def print_precison_recall_f1(y_true, y_pre):
    """打印精准率、召回率和F1值"""
    score=dict()
    print(classification_report(y_true, y_pre))
    f1_mac = round(f1_score(y_true, y_pre, average='macro'), 5)
    p_mac = round(precision_score(y_true, y_pre, average='macro'), 5)
    r_mac = round(recall_score(y_true, y_pre, average='macro'), 5)
    
    print("Precision_mac: {}, Recall_mac: {}, F1_mac: {} ".format(p_mac, r_mac, f1_mac))

    f1_mic = round(f1_score(y_true, y_pre, average='micro'), 5)
    p_mic = round(precision_score(y_true, y_pre, average='micro'), 5)
    r_mic = round(recall_score(y_true, y_pre, average='micro'), 5)
    print("Precision_mic: {}, Recall_mic: {}, F1_mic: {} ".format(p_mic, r_mic, f1_mic))
    score={'macro':{'f1_mac':f1_mac,'p_mac':p_mac,'r_mac':r_mac},'micro':{'f1_mic':f1_mic,'p_mic':p_mic,'r_mic':r_mic}}
    return score

def get_data(path):
    df = pd.read_csv(path)
    df.dropna(axis=0, how='any', inplace=True)
    data_process = DataProcess()
    df['reviewComment'] = df['reviewComment'].map(data_process.text_replace)  # 替换评论数据中的乱码字符
    df['reviewComment'] = df['reviewComment'].map(data_process.sen_analy)  # 获取评论数据情感倾向
    df['date'] = df['date'].map(data_process.time_format)  # 获取年份数据
    df.drop(labels=None, axis=1, index=None, columns='recordId', inplace=True)

    useful_level = GetUsefulLevel(df['usefulCount'])
    df['usefulCount'] = df['usefulCount'].map(useful_level.get_current_level)  # 获取count的分级，分为10级
    df['drugName'] = data_process.target_encoder(df['drugName'], df['rating'])  # 对drugName进行目标编码
    df['condition'] = data_process.target_encoder(df['condition'], df['rating'])  # 对condition进行目标编码
    df['sideEffects'] = data_process.side_effect_level(df['sideEffects'])  # 对sideEffects划分等级
    return df

def xgb_train(x_train,y_train):
    xgboost_clf = XGBClassifier(min_child_weight=6, max_depth=10, early_stopping_rounds=8,
                               objective='multi:softmax', num_class=5)
    xgboost_clf.fit(x_train, y_train)
    # feature_importances_ = xgboost_clf.feature_importances_
    return xgboost_clf

In [3]:
path=r'D:\Python\drug_review_predict\dataset'
train_path=os.path.join(path,'training.csv')
val_path = os.path.join(path, 'validation.csv')
train_data=get_data(train_path)    

dataprocess:


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [4]:
train_data 

Unnamed: 0,drugName,condition,reviewComment,date,usefulCount,sideEffects,rating
0,3.964286,3.789227,0.000000,2012,13,1,5
1,4.249512,3.444535,0.566667,2009,11,3,4
2,3.562500,2.839286,0.333333,2017,4,0,5
3,4.000000,4.329670,0.139063,2017,16,1,5
4,2.791563,3.287747,0.260926,2015,4,3,5
...,...,...,...,...,...,...,...
6994,3.714429,3.501435,0.143056,2016,9,1,5
6995,4.266667,3.085162,0.133929,2017,1,0,4
6996,3.111111,4.100000,0.089464,2016,19,1,5
6997,4.024390,3.814815,0.076042,2017,2,0,1


In [5]:
train_data['reviewComment'].value_counts()

 0.000000    314
 0.250000     59
 0.500000     51
 0.200000     50
 0.100000     38
            ... 
 0.074286      1
-0.091443      1
 0.158586      1
-0.082407      1
 0.076042      1
Name: reviewComment, Length: 4753, dtype: int64

In [8]:
def comment2num(comment):
    if -1<= comment<-0.1:
        return -1
    elif -0.1<=comment<0.1:
        return 0
    else:
        return 1

In [9]:
train_data['reviewComment']=train_data['reviewComment'].map(comment2num)

In [10]:
train_data['reviewComment'].value_counts()

 0    2952
 1    2864
-1    1149
Name: reviewComment, dtype: int64

In [11]:
train_data

Unnamed: 0,drugName,condition,reviewComment,date,usefulCount,sideEffects,rating
0,3.964286,3.789227,0,2012,13,1,5
1,4.249512,3.444535,1,2009,11,3,4
2,3.562500,2.839286,1,2017,4,0,5
3,4.000000,4.329670,1,2017,16,1,5
4,2.791563,3.287747,1,2015,4,3,5
...,...,...,...,...,...,...,...
6994,3.714429,3.501435,1,2016,9,1,5
6995,4.266667,3.085162,1,2017,1,0,4
6996,3.111111,4.100000,0,2016,19,1,5
6997,4.024390,3.814815,0,2017,2,0,1


In [12]:
x_new_data=pd.DataFrame(columns=train_data.columns[:-1])
y_new_data = pd.DataFrame(columns=[train_data.columns[-1]])
smo = SMOTE(random_state=0)

In [88]:
x_new_data

Unnamed: 0,drugName,condition,reviewComment,date,usefulCount,sideEffects


In [89]:
y_new_data

Unnamed: 0,rating


In [13]:
for label in range(1,5):
    x_train=train_data[train_data['rating'].isin([label,5])].iloc[:,:-1]
    y_train=pd.DataFrame(train_data[train_data['rating'].isin([label,5])].iloc[:,-1],columns=['rating'])
    x_train_smo,y_train_smo=smo.fit_resample(x_train,y_train)
    x_new_data=x_new_data.append(x_train_smo[y_train_smo['rating']==label],ignore_index=True)
    y_new_data = y_new_data.append(y_train_smo[y_train_smo['rating']==label], ignore_index=True)

x_new_data=x_new_data.append(train_data[train_data['rating'] == 5].iloc[:, :-1], ignore_index=True)  # 将标签为5的数据合并
y_new_data=y_new_data.append(pd.DataFrame(train_data[train_data['rating'] == 5].iloc[:, -1], columns=['rating']),
                      ignore_index=True)  # 将标签为5的数据合并

  x_new_data=x_new_data.append(x_train_smo[y_train_smo['rating']==label],ignore_index=True)
  y_new_data = y_new_data.append(y_train_smo[y_train_smo['rating']==label], ignore_index=True)
  x_new_data=x_new_data.append(x_train_smo[y_train_smo['rating']==label],ignore_index=True)
  y_new_data = y_new_data.append(y_train_smo[y_train_smo['rating']==label], ignore_index=True)
  x_new_data=x_new_data.append(x_train_smo[y_train_smo['rating']==label],ignore_index=True)
  y_new_data = y_new_data.append(y_train_smo[y_train_smo['rating']==label], ignore_index=True)
  x_new_data=x_new_data.append(x_train_smo[y_train_smo['rating']==label],ignore_index=True)
  y_new_data = y_new_data.append(y_train_smo[y_train_smo['rating']==label], ignore_index=True)
  x_new_data=x_new_data.append(train_data[train_data['rating'] == 5].iloc[:, :-1], ignore_index=True)  # 将标签为5的数据合并
  y_new_data=y_new_data.append(pd.DataFrame(train_data[train_data['rating'] == 5].iloc[:, -1], columns=['rating']),


In [14]:
x_new_data

Unnamed: 0,drugName,condition,reviewComment,date,usefulCount,sideEffects
0,1.876676,3.287747,0,2011,4,4
1,3.972973,2.889166,-1,2017,11,2
2,4.333333,4.32967,0,2014,18,1
3,3.55,3.636364,0,2015,17,3
4,3.7,3.636364,1,2016,10,2
...,...,...,...,...,...,...
17055,4.266666,4.136364,0,2015,18,1
17056,4.055556,3.287747,1,2011,2,1
17057,4.5,4.344828,0,2010,3,2
17058,3.714429,3.501435,1,2016,9,1


In [15]:
y_new_data

Unnamed: 0,rating
0,1
1,1
2,1
3,1
4,1
...,...
17055,5
17056,5
17057,5
17058,5


array([[1.8766758142510356, 3.2877470355731226, -0.005888888888888877,
        2011, 4, 4],
       [3.972972972972973, 2.889165733969873, -0.16666666666666666, 2017,
        11, 2],
       [4.333333333269822, 4.329670329670329, 0.07499999999999998, 2014,
        18, 1],
       ...,
       [4.499999967477863, 4.344827586206462, -0.04243197278911565, 2010,
        3, 2],
       [3.714429289303661, 3.50143514326711, 0.14305555555555557, 2016,
        9, 1],
       [3.1111111360881067, 4.1, 0.08946428571428575, 2016, 19, 1]],
      dtype=object)

In [16]:
y_new_data['rating'].unique()

array([1, 2, 3, 4, 5], dtype=object)

In [17]:
xgb_model = xgb_train(x_new_data.values, y_new_data.values)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [18]:

train_pred = xgb_model.predict(x_new_data.values)
train_pred

array([1, 1, 5, ..., 5, 5, 5], dtype=object)

In [19]:
pred=train_pred.tolist()

In [20]:
y=y_new_data['rating'].values.tolist()

In [21]:
set(train_pred.tolist())

{1, 2, 3, 4, 5}

In [22]:
set(y_new_data['rating'].values.tolist())

{1, 2, 3, 4, 5}

In [23]:
print('Training score:')
train_score = print_precison_recall_f1(y, pred)

Training score:
              precision    recall  f1-score   support

           1       0.91      0.89      0.90      3412
           2       0.91      0.94      0.93      3412
           3       0.90      0.93      0.91      3412
           4       0.94      0.87      0.90      3412
           5       0.89      0.91      0.90      3412

    accuracy                           0.91     17060
   macro avg       0.91      0.91      0.91     17060
weighted avg       0.91      0.91      0.91     17060

Precision_mac: 0.90873, Recall_mac: 0.90803, F1_mac: 0.90791 
Precision_mic: 0.90803, Recall_mic: 0.90803, F1_mic: 0.90803 


In [24]:
val_data=get_data(val_path)
x_val=val_data.iloc[:,:-1]
y_val=val_data.iloc[:,-1]

dataprocess:


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [25]:
val_pred = xgb_model.predict(x_val)
val_pred=val_pred.tolist()
y_val=y_val.tolist()

In [26]:
print('Validation score:')
val_score = print_precison_recall_f1(y_val, val_pred)

Validation score:
              precision    recall  f1-score   support

           1       0.45      0.45      0.45       225
           2       0.08      0.23      0.12        82
           3       0.12      0.23      0.16       103
           4       0.21      0.28      0.24       192
           5       0.79      0.38      0.51       592

    accuracy                           0.35      1194
   macro avg       0.33      0.31      0.30      1194
weighted avg       0.53      0.35      0.40      1194

Precision_mac: 0.33143, Recall_mac: 0.31465, F1_mac: 0.29669 
Precision_mic: 0.35343, Recall_mic: 0.35343, F1_mic: 0.35343 
