<a href="https://colab.research.google.com/github/nicostanw/NLP_Toxic_Comment_Classification/blob/main/NLP_Project_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
import re
import string 
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch import nn
from torch import autograd
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import time
import copy
import tqdm.notebook as tq

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd drive/MyDrive/NLP project

/content/drive/MyDrive/NLP project


In [4]:
train = pd.read_csv('train.csv', sep=',')
print(train.shape)

(159571, 8)


In [5]:
train.head(15)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


# Data preprocessing

In [6]:
tokenizer = get_tokenizer('basic_english')

# Tokenization, on réduit les phrases en bouts simples

def clean_text(text):
  text = text.lower() #put in minuscule
  url_pattern = re.compile(r'https?://\S+|www\.\S+') #remove links 
  text = url_pattern.sub(r'', text)
  text = re.sub(r'\d+', '',text) #remove numbers
  translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #remove poncutation 
  text = text.translate(translator)
  text = re.sub(r'\n',' ',text) #remove the /n
  text = re.sub(' +', ' ', text) #remove big spaces
  return(text.strip())  

In [7]:
X = train["comment_text"].apply(clean_text)
X.head()

0    explanation why the edits made under my userna...
1    d aww he matches this background colour i m se...
2    hey man i m really not trying to edit war it s...
3    more i can t make any real suggestions on impr...
4    you sir are my hero any chance you remember wh...
Name: comment_text, dtype: object

In [8]:
y = train[train.columns[2:]].apply(lambda x:np.array(list(x)),axis=1)
print(y) 
print(len(y))
num_class = y[0].size
print(num_class)

0         [0, 0, 0, 0, 0, 0]
1         [0, 0, 0, 0, 0, 0]
2         [0, 0, 0, 0, 0, 0]
3         [0, 0, 0, 0, 0, 0]
4         [0, 0, 0, 0, 0, 0]
                 ...        
159566    [0, 0, 0, 0, 0, 0]
159567    [0, 0, 0, 0, 0, 0]
159568    [0, 0, 0, 0, 0, 0]
159569    [0, 0, 0, 0, 0, 0]
159570    [0, 0, 0, 0, 0, 0]
Length: 159571, dtype: object
159571
6
[15294  1595  8449   478  7877  1405]


# Construction du vocabulaire

In [9]:
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for text in X:
  counter.update(tokenizer(text))
vocab = Vocab(counter, min_freq=3) #Le vocab qui apparait au moins 3 fois

In [10]:
vocab_size = len(vocab)
print(vocab_size) #En tout on a 59 868 mots qui apparaissent au moins 3 fois

59868


In [11]:
max_len = X.map(lambda x:len(tokenizer(x))).max() #Le plus long texte 
print(max_len)
print(vocab['<pad>']) # Tous nos textes une fois finis seront complétés par '<pad>' jusqu'à atteindre le nombre maximal de mots 1403
# Plus présicément ce seront leurs formes vectorisées qui seront complétées par des 1

1403


In [12]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Construction du Dataloader Pytorch

In [13]:
class ToxicCommentDataset(Dataset):
    """Face Landmarks dataset"""

    def __init__(self, X,y):
        self.X = X
        self.y = y
        

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
       
        return (self.X.iloc[idx], self.y.iloc[idx])

# Fonction qui prend en entrée les éléments d'un batch et transforme chacun des ces éléments (des textes) par sa version vectorisée en accord avec le vocubalaire définit précédemment

In [None]:
def collate_batch(batch): 
    vectorized_seqs = []
    list_label = []
    for text, label in batch:
      vectorized_seqs.append(text_pipeline(text))
      list_label.append(label)
      
    seq_lengths = list(map(len, vectorized_seqs))
    
    seq_tensor = torch.ones(size=(len(batch),max_len), dtype=torch.int64)
    for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
      seq_tensor[idx,:seqlen] = torch.tensor(seq, dtype=torch.int64)
    
        
    return seq_tensor.to(device), torch.tensor(list_label,dtype=torch.int64).to(device)

# Création du modèle BI-LSTM


In [14]:

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        super(TextClassificationModel, self).__init__()
      
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=1)
        self.lstm = nn.LSTM(input_size=embed_dim,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)

        self.fc = nn.Linear(2*hidden_dim, num_class)
       # self.hidden = self.init_hidden()

    '''def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim)),   
                autograd.Variable(torch.zeros(2, self.batch_size, self.hidden_dim))) '''

    def forward(self, text):
        embed_text = self.embedding(text)
        _,(hn,_) = self.lstm(embed_text)
        x = torch.cat((hn[0,:,:],hn[1,:,:]),1)
        x = self.drop(x)
      
        return torch.sigmoid(self.fc(x))

# Fonction pour entrainer et évaluer le modèle. On sélectionne les paramètres du modèle qui obtiennent le meilleur roc_auc_score en moyenne sur les 6 classes

In [61]:

def train_model(model, X, y, optimizer, batch_size=10, num_epochs=9):
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=5, test_size=0.2)
    toxic_train = ToxicCommentDataset(X_train, y_train)
    trainloader = DataLoader(ToxicCommentDataset(X_train, y_train), batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    valloader   = DataLoader(ToxicCommentDataset(X_val, y_val), batch_size=500, shuffle=False, collate_fn=collate_batch)
    criterion = nn.BCELoss(reduction='sum')
    since = time.time()
    loss_val_history = []
    map_history = []
    roc_auc_history = []

    best_map = 0.0
    best_roc_auc = 0.0
    for epoch in range(1, num_epochs+1):
        
        running_train_loss = 0.0
        best_model_wts = copy.deepcopy(model.state_dict())
        
        
        model.train()
        # Iterate over data.
        for j, (inputs, labels) in tq.tqdm(enumerate(trainloader)):
        
                # zero the parameter gradients
                optimizer.zero_grad()
              
                # forward
                # track history if only in train
                outputs = model(inputs)
                
                loss = criterion(outputs, labels.float())
                
                #running_corrects += torch.sum(preds == labels.data[:,z])
                # backward + optimize only if in training phase
                loss.backward()
                optimizer.step()
          
                running_train_loss += loss.item()
             
                if j%364 == 0:
                  model.eval()  
                  map = 0 
                  i = 0
                  running_val_loss = 0.0
                  predict_proba = np.zeros((len(valloader.dataset), num_class))
                  if j==0 and epoch==1:
                    y_true = np.zeros((len(valloader.dataset), num_class))

                  with torch.no_grad():   
                    for val_inp, val_lab in valloader:
                      
                      val_out = model(val_inp)
                     
                      loss = criterion(val_out, val_lab.float())
                      running_val_loss += loss.item() 
                      preds = torch.where(val_out<0.5,0,1)
                    
                      predict_proba[i*len(val_lab):(i+1)*len(val_lab),:] = val_out.detach().cpu().numpy()
                      if j==0 and epoch==1:
                        y_true[i*len(val_lab):(i+1)*len(val_lab),:] = val_lab.detach().cpu().numpy()
                      map += (preds==val_lab).float().mean()
                      i += 1
                  mean_roc_auc=0
                  for u in range(num_class):
                    mean_roc_auc += roc_auc_score(y_true[:,u], predict_proba[:,u])
                  mean_roc_auc = mean_roc_auc/num_class

                  runnig_val_loss = running_val_loss/(len(valloader.dataset)*num_class)
                  map = map/i
                  roc_auc_history.append(mean_roc_auc)
                  loss_val_history.append(running_val_loss)
                  map_history.append(map)
                  print(f"Epoch[{epoch}/{num_epochs}], batch[{j+1}/{len(trainloader.dataset)//batch_size}]: val_loss={running_val_loss:.3f}",
                        f"mean average precision={map:.3f}", f" mean_roc_auc={mean_roc_auc:.3f}")
                  if mean_roc_auc > best_roc_auc:
                    best_model_wts = copy.deepcopy(model.state_dict())

                  model.train()
    model.load_state_dict(best_model_wts)
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
          time_elapsed // 60, time_elapsed % 60))
    return model, loss_val_history, map_history, roc_auc_history

# Entraînement et évaluation du modèle

In [95]:
batch_size = 50
model = TextClassificationModel(vocab_size,100,100,num_class).to(device)
optimizer = optim.Adam(model.parameters(),lr=1e-3)

In [96]:
model, loss_val_history, map_history, roc_auc_history = train_model(model, X, y, optimizer, batch_size=50, num_epochs=5)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Epoch[1/5],batch[1/2553]: val_loss=128110.328 mean average precision=0.762  mean_roc_auc=0.488
Epoch[1/5],batch[365/2553]: val_loss=20061.802 mean average precision=0.970  mean_roc_auc=0.833
Epoch[1/5],batch[729/2553]: val_loss=15133.885 mean average precision=0.976  mean_roc_auc=0.922
Epoch[1/5],batch[1093/2553]: val_loss=13057.624 mean average precision=0.978  mean_roc_auc=0.950
Epoch[1/5],batch[1457/2553]: val_loss=12345.977 mean average precision=0.978  mean_roc_auc=0.957
Epoch[1/5],batch[1821/2553]: val_loss=11813.394 mean average precision=0.979  mean_roc_auc=0.959
Epoch[1/5],batch[2185/2553]: val_loss=10891.167 mean average precision=0.981  mean_roc_auc=0.965
Epoch[1/5],batch[2549/2553]: val_loss=10612.339 mean average precision=0.981  mean_roc_auc=0.968



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Epoch[2/5],batch[1/2553]: val_loss=10562.894 mean average precision=0.981  mean_roc_auc=0.969
Epoch[2/5],batch[365/2553]: val_loss=10209.362 mean average precision=0.981  mean_roc_auc=0.971
Epoch[2/5],batch[729/2553]: val_loss=11991.170 mean average precision=0.980  mean_roc_auc=0.947
Epoch[2/5],batch[1093/2553]: val_loss=10530.824 mean average precision=0.981  mean_roc_auc=0.968
Epoch[2/5],batch[1457/2553]: val_loss=9927.492 mean average precision=0.982  mean_roc_auc=0.973
Epoch[2/5],batch[1821/2553]: val_loss=9773.583 mean average precision=0.982  mean_roc_auc=0.974
Epoch[2/5],batch[2185/2553]: val_loss=9727.911 mean average precision=0.982  mean_roc_auc=0.974
Epoch[2/5],batch[2549/2553]: val_loss=9455.142 mean average precision=0.982  mean_roc_auc=0.977



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Epoch[3/5],batch[1/2553]: val_loss=9410.352 mean average precision=0.982  mean_roc_auc=0.977
Epoch[3/5],batch[365/2553]: val_loss=9558.419 mean average precision=0.982  mean_roc_auc=0.977
Epoch[3/5],batch[729/2553]: val_loss=9200.189 mean average precision=0.982  mean_roc_auc=0.978
Epoch[3/5],batch[1093/2553]: val_loss=9170.535 mean average precision=0.983  mean_roc_auc=0.979
Epoch[3/5],batch[1457/2553]: val_loss=9274.384 mean average precision=0.982  mean_roc_auc=0.979
Epoch[3/5],batch[1821/2553]: val_loss=8947.084 mean average precision=0.983  mean_roc_auc=0.979
Epoch[3/5],batch[2185/2553]: val_loss=12276.974 mean average precision=0.979  mean_roc_auc=0.960
Epoch[3/5],batch[2549/2553]: val_loss=10277.927 mean average precision=0.981  mean_roc_auc=0.970



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Epoch[4/5],batch[1/2553]: val_loss=10349.937 mean average precision=0.981  mean_roc_auc=0.970
Epoch[4/5],batch[365/2553]: val_loss=9690.702 mean average precision=0.982  mean_roc_auc=0.975
Epoch[4/5],batch[729/2553]: val_loss=9303.316 mean average precision=0.982  mean_roc_auc=0.977
Epoch[4/5],batch[1093/2553]: val_loss=9116.590 mean average precision=0.983  mean_roc_auc=0.978
Epoch[4/5],batch[1457/2553]: val_loss=9106.045 mean average precision=0.983  mean_roc_auc=0.979
Epoch[4/5],batch[1821/2553]: val_loss=9071.329 mean average precision=0.983  mean_roc_auc=0.979
Epoch[4/5],batch[2185/2553]: val_loss=8984.179 mean average precision=0.983  mean_roc_auc=0.980
Epoch[4/5],batch[2549/2553]: val_loss=8950.120 mean average precision=0.983  mean_roc_auc=0.981



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Epoch[5/5],batch[1/2553]: val_loss=8839.346 mean average precision=0.983  mean_roc_auc=0.981
Epoch[5/5],batch[365/2553]: val_loss=8872.803 mean average precision=0.983  mean_roc_auc=0.981
Epoch[5/5],batch[729/2553]: val_loss=8875.332 mean average precision=0.983  mean_roc_auc=0.980
Epoch[5/5],batch[1093/2553]: val_loss=8924.741 mean average precision=0.983  mean_roc_auc=0.981
Epoch[5/5],batch[1457/2553]: val_loss=9017.682 mean average precision=0.983  mean_roc_auc=0.981
Epoch[5/5],batch[1821/2553]: val_loss=9176.468 mean average precision=0.983  mean_roc_auc=0.979
Epoch[5/5],batch[2185/2553]: val_loss=9124.640 mean average precision=0.983  mean_roc_auc=0.980
Epoch[5/5],batch[2549/2553]: val_loss=8976.265 mean average precision=0.983  mean_roc_auc=0.981

Training complete in 40m 60s


# On définit le deuxième modèle

In [65]:
model2, loss_val_history, map_history, roc_auc_history = train_model(model, X, y, optimizer, batch_size=50,num_epochs=2)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Epoch[1/2],batch[1/2553]: val_loss=9043.986 mean average precision=0.983  mean_roc_auc=0.981
Epoch[1/2],batch[365/2553]: val_loss=9252.085 mean average precision=0.983  mean_roc_auc=0.982
Epoch[1/2],batch[729/2553]: val_loss=9289.302 mean average precision=0.982  mean_roc_auc=0.982
Epoch[1/2],batch[1093/2553]: val_loss=9458.588 mean average precision=0.983  mean_roc_auc=0.981
Epoch[1/2],batch[1457/2553]: val_loss=9291.348 mean average precision=0.983  mean_roc_auc=0.981
Epoch[1/2],batch[1821/2553]: val_loss=9463.379 mean average precision=0.983  mean_roc_auc=0.982
Epoch[1/2],batch[2185/2553]: val_loss=9296.281 mean average precision=0.983  mean_roc_auc=0.982
Epoch[1/2],batch[2549/2553]: val_loss=9679.643 mean average precision=0.983  mean_roc_auc=0.981



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Epoch[2/2],batch[1/2553]: val_loss=9654.759 mean average precision=0.983  mean_roc_auc=0.981
Epoch[2/2],batch[365/2553]: val_loss=9769.775 mean average precision=0.982  mean_roc_auc=0.981
Epoch[2/2],batch[729/2553]: val_loss=9612.522 mean average precision=0.982  mean_roc_auc=0.981
Epoch[2/2],batch[1093/2553]: val_loss=9527.517 mean average precision=0.982  mean_roc_auc=0.981
Epoch[2/2],batch[1457/2553]: val_loss=9700.274 mean average precision=0.982  mean_roc_auc=0.981
Epoch[2/2],batch[1821/2553]: val_loss=10005.186 mean average precision=0.983  mean_roc_auc=0.981
Epoch[2/2],batch[2185/2553]: val_loss=9832.951 mean average precision=0.983  mean_roc_auc=0.982
Epoch[2/2],batch[2549/2553]: val_loss=9481.984 mean average precision=0.982  mean_roc_auc=0.982

Training complete in 16m 25s


# On réalise nos prédictions sur le test set

In [75]:
test = pd.read_csv("test.csv", sep=',')
print(test)
X_test = test["comment_text"].apply(clean_text)

# X_test

                      id                                       comment_text
0       00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
1       0000247867823ef7  == From RfC == \n\n The title is fine as it is...
2       00013b17ad220c46  " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3       00017563c3f7919a  :If you have a look back at the source, the in...
4       00017695ad8997eb          I don't anonymously edit articles at all.
...                  ...                                                ...
153159  fffcd0960ee309b5  . \n i totally agree, this stuff is nothing bu...
153160  fffd7a9a6eb32c16  == Throw from out field to home plate. == \n\n...
153161  fffda9e8d6fafa9e  " \n\n == Okinotorishima categories == \n\n I ...
153162  fffe8f1340a79fc2  " \n\n == ""One of the founding nations of the...
153163  ffffce3fb183ee80  " \n :::Stop already. Your bullshit is not wel...

[153164 rows x 2 columns]


0         yo bitch ja rule is more succesful then you ll...
1                   from rfc the title is fine as it is imo
2                          sources zawe ashton on lapland —
3         if you have a look back at the source the info...
4                  i don t anonymously edit articles at all
                                ...                        
153159    i totally agree this stuff is nothing but too ...
153160    throw from out field to home plate does it get...
153161    okinotorishima categories i see your changes a...
153162    one of the founding nations of the eu germany ...
153163    stop already your bullshit is not welcome here...
Name: comment_text, Length: 153164, dtype: object

# On définit une deuxième fonction pour traiter les batchs du dataloader pour le train set  afin qu'il prennent en compte ta taille maximale des longueurs des séquences

In [82]:
def collate_batch2(batch):
    vectorized_seqs = []
    list_label = []
    for text, label in batch:
      vectorized_seqs.append(text_pipeline(text))
      list_label.append(label)
      
    seq_lengths = list(map(len, vectorized_seqs))
    seq_lengths = list(map(lambda x: min(x, max_len), seq_lengths))
    seq_tensor = torch.ones(size=(len(batch), max_len), dtype=torch.int64)
    for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
      seq_tensor[idx, :seqlen] = torch.tensor(seq[:max_len], dtype=torch.int64)
    
    return seq_tensor.to(device), torch.tensor(list_label, dtype=torch.int64).to(device)

In [97]:
testloader = DataLoader(ToxicCommentDataset(X_test, y.iloc[:len(X_test)]), batch_size=500, shuffle=False, collate_fn=collate_batch2)
predict_proba = np.zeros((len(testloader.dataset),num_class))
model.eval()
for i, (test_inp,_) in tq.tqdm(enumerate(testloader)):
  test_out = model(test_inp)

  predict_proba[i*test_inp.shape[0]:(i+1)*test_inp.shape[0],:] = test_out.detach().cpu().numpy()

submit1 = pd.DataFrame(data=predict_proba,columns=train.columns[2:])
submit1 = pd.concat((test["id"],submit1),axis=1)
submit1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998485,3.706980e-01,0.986276,0.055662,0.929928,0.318903
1,0000247867823ef7,0.000203,2.368356e-07,0.000033,0.000005,0.000022,0.000008
2,00013b17ad220c46,0.001258,1.261858e-06,0.000141,0.000013,0.000133,0.000045
3,00017563c3f7919a,0.001312,3.804437e-07,0.000088,0.000006,0.000060,0.000012
4,00017695ad8997eb,0.004181,8.949813e-06,0.000407,0.000084,0.000338,0.000154
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000
153160,fffd7a9a6eb32c16,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000
153161,fffda9e8d6fafa9e,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000
153162,fffe8f1340a79fc2,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000


In [93]:

predict_proba2 = np.zeros((len(testloader.dataset),num_class))
model2.eval()
for i, (test_inp, _) in tq.tqdm(enumerate(testloader)):
                      
  test_out = model2(test_inp)

  predict_proba[i*test_inp.shape[0]:(i+1)*test_inp.shape[0],:] = test_out.detach().cpu().numpy()
submit2 = pd.DataFrame(data=predict_proba,columns=train.columns[2:])
submit2 = pd.concat((test["id"],submit2),axis=1)
submit2

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999314,0.240307,0.992158,0.062269,0.976407,0.109066
1,0000247867823ef7,0.000485,0.000011,0.000082,0.000014,0.000121,0.000067
2,00013b17ad220c46,0.000318,0.000039,0.000095,0.000031,0.000106,0.000074
3,00017563c3f7919a,0.000181,0.000011,0.000031,0.000012,0.000077,0.000033
4,00017695ad8997eb,0.000764,0.000058,0.000328,0.000025,0.000304,0.000125
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
153160,fffd7a9a6eb32c16,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
153161,fffda9e8d6fafa9e,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
153162,fffe8f1340a79fc2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [98]:
submit1.to_csv("submit3.csv", index=False)
submit2.to_csv("submit2.csv", index=False)