In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import torch

# Device selection.
# FORCE_CPU pins the notebook to the CPU even when CUDA is present, and the
# messages below always report the device that is actually selected.
# Set FORCE_CPU = False to run on the GPU when one is available.
FORCE_CPU = True

if torch.cuda.is_available():
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    if FORCE_CPU:
        print('FORCE_CPU is set, using the CPU instead.')
        device = torch.device("cpu")
    else:
        print('We will use the GPU:', torch.cuda.get_device_name(0))
        device = torch.device("cuda")
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
import warnings
warnings.simplefilter('ignore')
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.special import softmax
import sklearn
from sklearn.metrics import log_loss, f1_score

def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch's CPU
    generator. When CUDA is available it also seeds the CUDA generators and
    switches cuDNN to deterministic mode with benchmarking disabled.

    Parameters
    ----------
    seed_value : int
        Seed passed unchanged to each generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Fix the seed once for the whole notebook.
seed_all(79)

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1660


In [2]:
# Load the three tab-separated input files.
evidence = pd.read_csv('evidence.csv', sep='\t')
fact_checking_train = pd.read_csv('fact_checking_train.csv', sep='\t')
# The test frame gets a constant placeholder label of 0 so that it has the
# same columns as the training frame.
fact_checking_test = pd.read_csv('fact_checking_test.csv', sep='\t').assign(label=0)

# Encode the six rating strings as integers 0..5, in the order listed here
# (appears to run from least to most truthful -- confirm with the task spec).
label_2_v = {
    name: code
    for code, name in enumerate(
        ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true']
    )
}
fact_checking_train['label'] = fact_checking_train['label'].map(label_2_v)

# Two-column (text, labels) views of the claims used by the model cells.
train, test = (
    frame[['claim', 'label']].rename(columns={'claim': 'text', 'label': 'labels'})
    for frame in (fact_checking_train, fact_checking_test)
)

In [3]:
# Per-author label distribution.
# One row per training author; columns 0..5 hold the fraction of that
# author's training claims carrying each encoded label. crosstab replaces the
# previous nested loops (bare `except:` plus list-by-scalar division) and
# yields the same proportions; labels an author never received become 0.
#
# NOTE(review): the fractions are computed from *all* training rows, including
# each row's own label, and the same matrix is later sliced for the validation
# folds. That leaks the target into validation, so the cross-validation
# accuracy is likely optimistic. An out-of-fold computation would have to live
# inside the CV loop.
label_columns = list(range(6))
author_label = (
    pd.crosstab(
        fact_checking_train['author'],
        fact_checking_train['label'],
        normalize='index',
    )
    .reindex(columns=label_columns, fill_value=0.0)
    .rename_axis(index=None, columns=None)
)

# Attach the author features to every training row.
result = fact_checking_train.join(author_label, on='author')
train_x_label = result[label_columns].values

# Attach the author features to every test row.
result = fact_checking_test.join(author_label, on='author')
# Missing values: test rows whose author never appears in the training data.
count = result[0].isna().sum()
print(count)
# Unseen authors get an all-zero distribution.
result = result.fillna(0)
test_x_label = result[label_columns].values

257


In [4]:
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
# Character n-gram TF-IDF (3- to 5-grams within word boundaries). Terms that
# occur in fewer than 3 documents or in more than half of them are dropped.
# `vec` is refit inside the cross-validation loop further down.
vec = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)
X = vec.fit_transform(train['text'])

# Ridge regression on the integer-encoded labels. The fitted `model` is not
# referenced by the later cells visible in this notebook.
model = Ridge(alpha=1.0)
model.fit(X, train['labels'])

Ridge()

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.decomposition import TruncatedSVD
# 10-fold stratified cross-validation.
# Each fold refits TF-IDF and a 128-component truncated SVD on its training
# part, appends the per-author label distribution, trains a logistic
# regression, scores it on the held-out part, and stores class probabilities
# for the test set so the folds can be ensembled afterwards.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=79)
err = []         # per-fold validation accuracy (name kept for compatibility)
y_pred_tot = []  # per-fold predict_proba output for the test set
for train_index, test_index in kf.split(train, train['labels']):
    train1_trn, train1_val = train.iloc[train_index], train.iloc[test_index]

    # Training features: the text pipeline is fit on this fold's training rows only.
    X = vec.fit_transform(train1_trn['text'])
    svdT = TruncatedSVD(n_components=128, random_state=2021)
    svdT.fit(X)
    X = svdT.transform(X)
    X = np.hstack([X, train_x_label[train_index]])

    # Validation features: transformed with the pipeline fitted above.
    X_test = vec.transform(train1_val['text'])
    X_test = svdT.transform(X_test)
    X_test = np.hstack([X_test, train_x_label[test_index]])

    # Test features: recomputed per fold because vec and svdT are refit per fold.
    test_data = vec.transform(test['text'])
    test_data = svdT.transform(test_data)
    test_data = np.hstack([test_data, test_x_label])

    clf = LogisticRegression(random_state=79).fit(X, train1_trn['labels'])
    y_pred = clf.predict(X_test)
    acc = accuracy_score(train1_val['labels'], y_pred)
    print(acc)
    err.append(acc)

    # predict_proba returns probabilities, not logits.
    raw_outputs = clf.predict_proba(test_data)
    y_pred_tot.append(raw_outputs)

# The metric collected in `err` is accuracy, so label it as such.
print("Mean accuracy: ", np.mean(err))

0.5155555555555555
0.5055555555555555
0.5111111111111111
0.5172222222222222
0.5155555555555555
0.525
0.52
0.5083333333333333
0.4988888888888889
0.5044444444444445
Mean f1 score:  0.5121666666666667


In [6]:
def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    For a 1-D input the whole vector is normalized; for a 2-D input each row
    is normalized independently. Inputs with more dimensions are normalized
    along their last axis. The maximum of each slice is subtracted before
    exponentiation so large values do not overflow.

    Note: this definition shadows ``scipy.special.softmax`` imported at the
    top of the notebook; for 1-D and 2-D input the result corresponds to
    ``scipy.special.softmax(x, axis=-1)``.

    Parameters
    ----------
    x : array_like
        Scores to normalize.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    x = np.asarray(x)
    orig_shape = x.shape

    # Treat a 0-d input as a single-element vector so axis=-1 is valid.
    scores = np.atleast_1d(x)

    # Shift by the per-slice maximum for numerical stability.
    shifted = scores - np.max(scores, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    out = (exps / np.sum(exps, axis=-1, keepdims=True)).reshape(orig_shape)

    assert out.shape == orig_shape
    return out
# Sanity check: the softmaxed first row of the fold-0 test predictions should sum to 1.
sum(softmax(y_pred_tot[0])[0])

1.0

In [7]:
# Soft-voting ensemble over the cross-validation folds.
# y_pred_tot holds predict_proba outputs, which are already probabilities, so
# they are averaged directly; pushing them through softmax again would flatten
# each fold's distribution before averaging. Using the mean over the list also
# avoids hard-coding the number of folds.
mean_proba = np.mean(y_pred_tot, axis=0)

# Column index is used as the encoded label.
# NOTE(review): assumes every fold's classifier saw all six classes in order
# 0..5 (clf.classes_) -- confirm.
res = np.argmax(mean_proba, axis=1)

sub = pd.read_csv('submission.csv', sep='\t')
sub['label'] = res
# Inverse of the label encoding used for training.
v_2_label = {0:'pants-fire',1:'false',2:'barely-true',3:'half-true',4:'mostly-true',5:'true'}
sub['label'] = sub['label'].map(v_2_label)
sub.to_csv('sub_test.csv', index=False, sep='\t')

In [8]:
# Read the written submission back from disk and display it as a final check.
subtest = pd.read_csv('./sub_test.csv', sep='\t')
subtest

Unnamed: 0,ID,label
0,18000,barely-true
1,18001,false
2,18002,barely-true
3,18003,false
4,18004,false
...,...,...
1788,19788,barely-true
1789,19789,pants-fire
1790,19790,true
1791,19791,false


In [9]:
# temp = fact_checking_train.groupby('author')['label'].value_counts()
# author = set(fact_checking_train['author'].values)
# author_label = {}
# for a in author:
#     counts = []
#     for i in range(6):
#         try:
#             count = temp[a][i]
#         except:
#             count = 0
#         counts.append(count)
#     author_label[a] = counts/sum(counts)
# author_label = pd.DataFrame.from_dict(author_label, orient='index')

# result = fact_checking_train.join(author_label, on='author')

# result = fact_checking_test.join(author_label, on='author')
# result = result.fillna(0)