In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
import matplotlib.pylab as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# df = pd.read_csv('../data/train/change_info.csv')
# value = 'bgxmdm'

In [3]:
# df[value] = df[value].astype(str)
# df[value].fillna('-1', inplace=True)
# group_df = df.groupby(['id']).apply(lambda x: x[value].tolist()).reset_index()
# group_df.columns = ['id', 'list']
# group_df['list'] = group_df['list'].apply(lambda x: ','.join(x))
# enc_vec = TfidfVectorizer()
# tfidf_vec = enc_vec.fit_transform(group_df['list'])

In [4]:
# svd_enc = TruncatedSVD(n_components=4, n_iter=20, random_state=2020)
# vec_svd = svd_enc.fit_transform(tfidf_vec)

In [5]:
# tfidf_vec

In [6]:
def _svd_text_features(df, value, n, vectorizer, prefix):
    """Vectorise the per-id values of one column and compress them with SVD.

    Every value of ``df[value]`` is rendered as a string, the strings that
    share an ``id`` are joined with commas into one document, ``vectorizer``
    turns the documents into a term matrix and ``TruncatedSVD`` reduces that
    matrix to ``n`` components.

    Parameters
    ----------
    df : DataFrame with an ``id`` column and a ``value`` column. Not modified.
    value : name of the column to vectorise.
    n : number of SVD components (and of feature columns returned).
    vectorizer : object exposing ``fit_transform(documents)``.
    prefix : leading part of the generated column names.

    Returns
    -------
    DataFrame with one row per id: ``id`` followed by
    ``{prefix}_{value}_0 .. {prefix}_{value}_{n-1}``.
    """
    column = df[value]
    # Build the tokens on a derived Series so the caller's frame keeps its
    # original dtype and values. Missing entries get the '-1' sentinel
    # explicitly; relying on a fillna after the string cast does not work,
    # because the cast may already have rendered them as the text 'nan'.
    # NOTE(review): the scikit-learn text vectorizers' default token pattern
    # only keeps tokens of two or more word characters, so '-1' (and any
    # single-character value) may not produce a term - confirm this is wanted.
    tokens = column.astype(str).where(column.notna(), '-1')

    # One comma-separated document per id, ordered by id.
    docs = tokens.groupby(df['id']).agg(','.join)

    term_matrix = vectorizer.fit_transform(docs)
    svd_enc = TruncatedSVD(n_components=n, n_iter=20, random_state=2020)
    reduced = svd_enc.fit_transform(term_matrix)

    features = pd.DataFrame(
        reduced,
        columns=['{}_{}_{}'.format(prefix, value, i) for i in range(n)],
    )
    features.insert(0, 'id', docs.index.to_numpy())
    return features


def gen_user_tfidf_features(df, value,n):
    """Return ``n`` SVD components of the TF-IDF matrix of ``df[value]`` per id.

    Output columns are ``id`` and ``svd_tfidf_{value}_{i}`` for ``i < n``.
    ``df`` is left unchanged.
    """
    return _svd_text_features(df, value, n, TfidfVectorizer(), 'svd_tfidf')


def gen_user_countvec_features(df, value,n):
    """Return ``n`` SVD components of the term-count matrix of ``df[value]`` per id.

    Output columns are ``id`` and ``svd_countvec_{value}_{i}`` for ``i < n``.
    ``df`` is left unchanged.
    """
    return _svd_text_features(df, value, n, CountVectorizer(), 'svd_countvec')

In [7]:
def df2_group_FUNDAM_features(df, value):
    """Pivot ``FUNDAM`` by id and by the categories of ``value``.

    For every (id, category) pair the mean and the count of ``FUNDAM`` are
    computed; combinations that do not occur are filled with 0. The resulting
    two-level column index is flattened to
    ``id_{value}_{category}_FUNDAM_{mean|count}`` and ``id`` is returned as a
    regular column.
    """
    pivoted = df.pivot_table(
        index='id',
        columns=value,
        values='FUNDAM',
        aggfunc=['mean', 'count'],
        dropna=False,
    )
    pivoted = pivoted.fillna(0)

    # Each column key is (aggregation name, category of `value`).
    flat_names = []
    for key in pivoted.columns:
        stat, category = key[0], key[1]
        flat_names.append('id_{}_{}_FUNDAM_{}'.format(value, category, stat))
    pivoted.columns = flat_names

    return pivoted.reset_index()

In [8]:
def get_en_10(data,cols):
    """Add 10-fold out-of-fold target (mean-label) encodings for ``cols``.

    Rows whose ``label`` is not null are treated as training rows, rows with a
    null ``label`` as test rows. For each column ``f`` in ``cols`` a new column
    ``f + '_target_enc'`` is created:

    * training rows receive the mean label of their category computed on the
      other folds only;
    * test rows receive the average, over all folds, of the per-fold category
      means;
    * categories that are absent from a fold's training part fall back to the
      mean label of all training rows.

    Parameters
    ----------
    data : DataFrame containing ``label`` and every column in ``cols``.
    cols : iterable of column names to encode.

    Returns
    -------
    (data, new_col) : the training rows followed by the test rows (index
        reset) with the encoding columns added, and the list
        ``['id', <one encoding column name per entry of cols>]``.
    """
    train_data = data[data['label'].notnull()].reset_index(drop=True)
    test_data = data[data['label'].isnull()].reset_index(drop=True)

    new_col = ['id']

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2020)
    # Fallback for categories unseen in a fold; identical for every fold.
    prior = train_data['label'].mean()

    for f in tqdm(cols):
        enc_col = f + '_target_enc'
        # Float from the start: the columns receive fractional means below.
        train_data[enc_col] = 0.0
        test_data[enc_col] = 0.0

        for trn_idx, val_idx in skf.split(train_data, train_data['label']):
            # Category -> mean label, learned on this fold's training part only.
            fold_means = train_data.iloc[trn_idx].groupby(f)['label'].mean()

            val_enc = train_data[f].iloc[val_idx].map(fold_means).fillna(prior)
            test_enc = test_data[f].map(fold_means).fillna(prior)

            # train_data has a fresh RangeIndex, so positions equal labels.
            train_data.loc[val_idx, enc_col] = val_enc.values
            test_data[enc_col] += test_enc.values / skf.n_splits

        # Register the column once per feature, not once per fold.
        new_col.append(enc_col)

    data = pd.concat([train_data, test_data], ignore_index=True)
    return data, new_col

In [9]:
# pd.read_csv('../data/train/tax_info.csv')

In [10]:
def deal_df3():
    """Build per-id tax features from ``../data/train/tax_info.csv``.

    Returns a DataFrame with one row per ``id`` and the columns
    ``TAX_AMOUNT_sum`` and ``TAX_AMOUNT_mean`` (sum and mean of ``TAX_AMOUNT``
    over the rows of that id).
    """
    df3 = pd.read_csv('../data/train/tax_info.csv')

    # Aggregate (one value per id) instead of transform (one value per input
    # row): a transform result is indexed like df3 and would be aligned by row
    # number against the one-row-per-id frame, attaching wrong values to ids.
    tax_stats = df3.groupby(['id'])['TAX_AMOUNT'].agg(['sum', 'mean'])
    tax_stats.columns = ['TAX_AMOUNT_sum', 'TAX_AMOUNT_mean']
    df_create = tax_stats.reset_index()

    # Disabled experiments, kept for reference:
    # df_create = df_create.merge(gen_user_tfidf_features(df=df3, value='TAX_CATEGORIES',n=3), on=['id'], how='left')
    # df_create = df_create.merge(gen_user_countvec_features(df=df3, value='TAX_CATEGORIES',n=3), on=['id'], how='left')
    # df_create = df_create.merge(gen_user_tfidf_features(df=df3, value='TAX_ITEMS',n=3), on=['id'], how='left')
    # df_create = df_create.merge(gen_user_countvec_features(df=df3, value='TAX_ITEMS',n=3), on=['id'], how='left')

    return df_create

In [11]:
def deal_df4():
    """Build per-id features from ``../data/train/change_info.csv``.

    For each of the columns ``bgxmdm``, ``bgq`` and ``bgh`` the 5-component
    TF-IDF and count-vectorizer SVD features are left-joined, in that order,
    onto a frame holding one row per ``id``.
    """
    df4 = pd.read_csv('../data/train/change_info.csv')

    # The `bgrq` count only serves to obtain one row per id; it is dropped
    # again before returning.
    df_create = df4.groupby(['id'])['bgrq'].count().reset_index()

    feature_builders = (gen_user_tfidf_features, gen_user_countvec_features)
    for column in ('bgxmdm', 'bgq', 'bgh'):
        for build in feature_builders:
            block = build(df=df4, value=column, n=5)
            df_create = df_create.merge(block, on=['id'], how='left')

    return df_create.drop(columns=['bgrq'])

In [12]:
def deal_df5():
    """Build per-id features from ``../data/train/news_info.csv``.

    The 2-component TF-IDF and count-vectorizer SVD features of the
    ``positive_negtive`` column are left-joined onto a frame holding one row
    per ``id``.
    """
    df5 = pd.read_csv('../data/train/news_info.csv')

    # The `public_date` count only serves to obtain one row per id; it is
    # dropped again before returning.
    df_create = df5.groupby(['id'])['public_date'].count().reset_index()

    for build in (gen_user_tfidf_features, gen_user_countvec_features):
        block = build(df=df5, value='positive_negtive', n=2)
        df_create = df_create.merge(block, on=['id'], how='left')

    return df_create.drop(columns=['public_date'])

In [13]:
def deal_df2():
    """Return one row per ``id`` found in ``../data/train/annual_report_info.csv``.

    With every feature experiment below disabled, the result contains the
    single column ``id``; the per-id count of ``ANCHEYEAR`` (``all_year``) is
    computed only to build that frame and is dropped before returning.
    """
    df2 = pd.read_csv('../data/train/annual_report_info.csv')
    # label = pd.read_csv('../data/train/entprise_info.csv')
    # df2 = df2.merge(label,on=['id'],how='left')

    year_counts = df2.groupby(['id'])['ANCHEYEAR'].count().reset_index()
    year_counts.columns = ['id', 'all_year']
    df_create = year_counts

    # --- Disabled experiments, kept for reference ---------------------------
    # df_create = df_create.merge(gen_user_tfidf_features(df=df2, value='FUNDAM',n=4), on=['id'], how='left')
    # df_create = df_create.merge(gen_user_countvec_features(df=df2, value='FUNDAM',n=4), on=['id'], how='left')

    # cols = ['STATE','EMPNUMSIGN','WEBSITSIGN','FORINVESTSIGN','STOCKTRANSIGN','PUBSTATE','BUSSTNAME']
    # data,new_cols = get_en_10(df2,cols)

    # data = data[new_cols]
    # data.drop_duplicates(['id'],keep='first',inplace=True)

    # cols = ['STATE','EMPNUMSIGN','WEBSITSIGN','FORINVESTSIGN','STOCKTRANSIGN','PUBSTATE','BUSSTNAME','ANCHEYEAR']
    # for col in cols:
    #     a = df2.groupby(['id'])[col].value_counts().unstack().fillna(0)
    #     for i in a.columns:
    #         df_create[col + '_'+str(i)+'_all']=df_create['id'].map(dict(a[i]))
    #         df_create[col + '_'+str(i)+'_all'].fillna(0,inplace=True)

    # cols = ['FUNDAM','MEMNUM','FARNUM','ANNNEWMEMNUM','ANNREDMEMNUM','EMPNUM','COLGRANUM','RETSOLNUM','DISPERNUM',
    #    'UNENUM','COLEMPLNUM','RETEMPLNUM','DISEMPLNUM','UNEEMPLNUM']
    # for col in cols:
    #     df_create['mean_'+col]=df_create['id'].map(dict(df2.groupby(['id'])[col].mean()))

    return df_create.drop(columns=['all_year'])

In [14]:
def deal_df1():
    """Load ``../data/train/base_info.csv`` and prepare the base feature table.

    Steps:

    * parse ``opfrom`` / ``opto`` as datetimes and add ``use_time``, the number
      of days between them;
    * replace missing ``opform`` values with the string ``'0'``;
    * label-encode the columns listed in ``Dis_cols``;
    * drop ``opfrom``, ``opto``, ``ptbusscope``, ``midpreindcode``,
      ``opscope``, ``parnum``, ``congro`` and ``forreccap``.

    Returns the resulting DataFrame (one row per input row).
    """
    data = pd.read_csv('../data/train/base_info.csv')
    # Disabled experiment:
    # data['opscope'] = data['opscope'].apply(lambda x: x.split('（依法须经批准的项目，经相关部门批准后方可开展经营活动）')[0])
    data['opfrom'] = pd.to_datetime(data['opfrom'])
    data['opto'] = pd.to_datetime(data['opto'])
    data['use_time'] = (data['opto'] - data['opfrom']).dt.days

    # Disabled experiment:
    # data['rate'] = data['reccap'] / data['regcap']

    # Assign the filled column back explicitly: an in-place fillna on
    # `data['opform']` acts on a temporary object and is not guaranteed to
    # reach `data` (it never does under pandas Copy-on-Write).
    data['opform'] = data['opform'].fillna('0')

    Dis_cols = ['oplocdistrict','industryphy','industryco','dom','enttype','enttypeitem','state',
           'orgid','jobid','opform','enttypeminu','protype','oploc','enttypegb']
    for f in tqdm(Dis_cols):
        # A fresh encoder per column: codes are only meaningful within a column.
        data[f] = LabelEncoder().fit_transform(data[f])

    # Disabled experiments:
    # data = data.merge(gen_user_tfidf_features(df=data, value='opscope',n=3), on=['id'], how='left')
    # data = data.merge(gen_user_countvec_features(df=data, value='opscope',n=3), on=['id'], how='left')

    return data.drop(
        columns=['opfrom','opto','ptbusscope','midpreindcode','opscope','parnum','congro',
                 'forreccap'],
    )

In [15]:
def deal_df6():
    """Load ``../data/train/other_info.csv`` and reduce it to one row per id.

    ``legal_judgment_num`` and ``brand_num`` are replaced by their per-id
    means; afterwards only the first row of every ``id`` is kept, so any other
    column carries the value of that first row.
    """
    df6 = pd.read_csv('../data/train/other_info.csv')

    per_id_mean = df6.groupby(['id'])[['legal_judgment_num', 'brand_num']].transform('mean')
    df6 = df6.assign(
        legal_judgment_num=per_id_mean['legal_judgment_num'],
        brand_num=per_id_mean['brand_num'],
    )
    # Disabled experiment:
    # df6['patent_num'] = df6.groupby(['id'])['patent_num'].transform('sum')

    return df6.drop_duplicates(['id'], keep='first')

In [16]:
# Build the feature tables used below. deal_df1() reads base_info.csv and
# deal_df5() reads news_info.csv (see their definitions above).
df1 = deal_df1()
# df1_opscope = pd.read_pickle('opscope.pkl')
# df1 = pd.concat((df1,df1_opscope),axis=1)
df5 = deal_df5()
# df6 = deal_df6()
# Table joined onto df1 by `id` in the next cell.
# NOTE(review): presumably holds the `label` target per id - confirm.
label = pd.read_csv('../data/train/entprise_info.csv')

100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 150.94it/s]


In [17]:
# Left-join the label table and then the news features onto the base table,
# keeping every row of df1.
data = (
    df1
    .merge(label, on=['id'], how='left')
    .merge(df5, on='id', how='left')
)
# data = data.merge(df6,on=['id'],how='left')

In [18]:
# df1['empnum'].fillna(df1['empnum'].median(),inplace=True)

In [19]:
# a = data[~data['exenum'].isin([np.nan])]

In [20]:
# a[a['label']==1][['industryphy','exenum']]

In [21]:
# a[a['label'] == 1]['industryphy'].value_counts()

In [22]:
# import seaborn as sns
# sns.scatterplot(x='industryphy',y='rate',data=data,hue='label')

In [23]:
# data.groupby(['label'])['exenum'].mean()

In [24]:
# df1['forregcap'].unique()

In [25]:
# df1.isnull().sum()

In [18]:
# Persist the merged feature table without the row index.
# NOTE(review): the `fea_data/` directory is assumed to exist already.
data.to_csv('fea_data/data.csv', index=False)

In [None]:
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2020)
# enc_list = ['regcap']
# for f in tqdm(enc_list):
#     train_data[f + '_target_enc'] = 0
#     test_data[f + '_target_enc'] = 0
#     for i, (trn_idx, val_idx) in enumerate(skf.split(train_data, train_data['label'])):
#         trn_x = train_data[[f, 'label']].iloc[trn_idx].reset_index(drop=True)
#         val_x = train_data[[f]].iloc[val_idx].reset_index(drop=True)
#         enc_df = trn_x.groupby(f, as_index=False)['label'].agg({f + '_target_enc': 'mean'})
#         val_x = val_x.merge(enc_df, on=f, how='left')
#         test_x = test_data[[f]].merge(enc_df, on=f, how='left')
#         val_x[f + '_target_enc'] = val_x[f + '_target_enc'].fillna(train_data['label'].mean())
#         test_x[f + '_target_enc'] = test_x[f + '_target_enc'].fillna(train_data['label'].mean())
#         train_data.loc[val_idx, f + '_target_enc'] = val_x[f + '_target_enc'].values
#         test_data[f + '_target_enc'] += test_x[f + '_target_enc'].values / skf.n_splits

In [None]:
# def lgb_reg_train(pout_train, test_df, target):
#     folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=666)
#     oof = np.zeros(len(pout_train))
#     predictions = np.zeros(len(test_df))

#     for fold_, (trn_idx, val_idx) in enumerate(folds.split(pout_train.values, target)):
#         clf = xgb.XGBRegressor()
#         clf.fit(pout_train.values[trn_idx],target[trn_idx])
#         oof[val_idx] = clf.predict(pout_train.values[val_idx])
#         predictions += clf.predict(test_df.values) / folds.n_splits

#     return oof, predictions

# train_oof, test_pre = lgb_reg_train(train_data[fea], test_data[fea], train_data['label'])

# train_data['sign'] = train_oof
# test_data['sign'] = test_pre
# fea = [i for i in train_data.columns if i not in ['id','label']]