In [2]:
import pandas as pd
import numpy as np
import os
import random
from sklearn.model_selection import train_test_split
import torch

In [3]:
# Load the tokenizer published under the 'bert-base-uncased' checkpoint name.
# It is called further down to turn the ticket text into model inputs.
from transformers import BertTokenizer
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Pin the global random generators (Python's and NumPy's) to the same fixed
# seed so that any step drawing from them gives the same result on every run.
random.seed(33)
np.random.seed(33)

### Set up paths

In [5]:
# Folder names only (no full paths); the paths are assembled from these below.

# Root data folder and its two direct children.
data = 'DATA'
raw_data_folder = 'RAW_DATA'
processed_data_folder = 'PROCESSED_DATA'

# Sub-folders of the processed data folder, one per split.
train_folder = 'TRAIN'
test_folder = 'TEST'

# Further names that are not referenced in the cells shown here.
word2vec_folder = 'word2vec'
tfid_folder = 'tfid'
final_dict = 'grp0_dict'

In [6]:
# Resolve every data directory first, then create the whole tree in one pass.
#
# Layout, relative to the current working directory:
#   <data>/
#     <raw_data_folder>/            input CSV is read from here
#     <processed_data_folder>/
#       <train_folder>/             training tensors and class weights go here
#       <test_folder>/              test tensors go here
DATA_PATH = os.path.join(data)
RAW_DATA_PATH = os.path.join(DATA_PATH, raw_data_folder)
PROCESSED_DATA_PATH = os.path.join(DATA_PATH, processed_data_folder)
TRAIN_PATH = os.path.join(PROCESSED_DATA_PATH, train_folder)
TEST_PATH = os.path.join(PROCESSED_DATA_PATH, test_folder)

# Every directory is created at the exact path that the later cells write to;
# in particular the train folder must sit under PROCESSED_DATA_PATH, because
# that is where TRAIN_PATH points. exist_ok=True keeps the cell safe to re-run,
# and makedirs also creates any missing parent directory.
for _dir_path in (DATA_PATH, RAW_DATA_PATH, PROCESSED_DATA_PATH, TRAIN_PATH, TEST_PATH):
    os.makedirs(_dir_path, exist_ok=True)

In [7]:
# Read the ticket table from the raw data folder; the file is decoded as latin-1.
labels_csv_path = os.path.join(RAW_DATA_PATH, 'Labels.csv')
df = pd.read_csv(labels_csv_path, encoding='latin-1')

In [8]:
# Preview the first ten rows to see which columns are available.
df.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Short description,Description,Caller,Assignment group,Lang_Short_Description,Lang_Description,Final_description,Total,Grp_change_1,group_cpy
0,0,0,login issue,-verified user details.(employee# & manager na...,spxjnwir pjlcoqds,GRP_0,en,en,login issue -verifie user details.(employee # ...,login issue -verifie user details.(employee # ...,5,GRP_0
1,1,1,outlook,\n\nreceived from: hmjdrvpb.komuaywn@gmail.com...,hmjdrvpb komuaywn,GRP_0,en,en,outlook receive from : hmjdrvpb.komuaywn@gm...,outlook receive from : hmjdrvpb.komuaywn@gm...,1,GRP_0
2,2,2,cant log in to vpn,\n\nreceived from: eylqgodm.ybqkwiam@gmail.com...,eylqgodm ybqkwiam,GRP_0,en,en,can not log in to vpn receive from : eylqgo...,can not log in to vpn receive from : eylqgo...,2,GRP_0
3,3,3,unable to access hr_tool page,unable to access hr_tool page,xbkucsvz gcpydteq,GRP_0,en,en,unable to access hr_tool page xbkucsvz gcpydteq,unable to access hr_tool page,1,GRP_0
4,4,4,skype error,skype error,owlgqjme qhcozdfx,GRP_0,en,en,skype error owlgqjme qhcozdfx,skype error,4,GRP_0
5,5,5,unable to log in to engineering tool and skype,unable to log in to engineering tool and skype,eflahbxn ltdgrvkz,GRP_0,en,en,unable to log in to engineering tool and skype...,unable to log in to engineering tool and skype,2,GRP_0
6,6,6,event: critical:HostName_221.company.com the v...,event: critical:HostName_221.company.com the v...,jyoqwxhz clhxsoqy,GRP_1,en,en,event : critical : hostname_221.company.com th...,event : critical : hostname_221.company.com th...,Other,GRP_1
7,7,7,ticket_no1550391- employment status - new non-...,ticket_no1550391- employment status - new non-...,eqzibjhw ymebpoih,GRP_0,en,en,ticket_no1550391- employment status - new non ...,ticket_no1550391- employment status - new non ...,6,GRP_0
8,8,8,unable to disable add ins on outlook,unable to disable add ins on outlook,mdbegvct dbvichlg,GRP_0,en,en,unable to disable add in on outlook mdbegvct d...,unable to disable add in on outlook,5,GRP_0
9,9,9,ticket update on inplant_874773,ticket update on inplant_874773,fumkcsji sarmtlhy,GRP_0,en,en,ticket update on inplant_874773 fumkcsji sarmtlhy,ticket update on inplant_874773,0,GRP_0


The feature text is taken from the `Final_description` column, and the labels are taken from the `Grp_change_1` column.

In [9]:
# Show the rows whose label is missing; an empty frame means every row is labelled.
df[df['Grp_change_1'].isna()]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Short description,Description,Caller,Assignment group,Lang_Short_Description,Lang_Description,Final_description,Total,Grp_change_1,group_cpy


In [10]:
# Show the rows whose feature text is missing; an empty frame means none is missing.
df[df['Final_description'].isna()]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Short description,Description,Caller,Assignment group,Lang_Short_Description,Lang_Description,Final_description,Total,Grp_change_1,group_cpy


### Train/test split

In [11]:
# Hold out 10% of the rows for testing. Stratifying on the label column keeps
# the class proportions the same in both splits, and the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Final_description'].values,
    df['Grp_change_1'].values,
    test_size=0.1,
    stratify=df['Grp_change_1'],
    random_state=33,
)

### Tokenizing the features

In [12]:
# Both splits are encoded with exactly the same settings: special tokens are
# added, every text is truncated or padded to 512 positions, and the result is
# returned as PyTorch tensors.
encode_options = dict(
    add_special_tokens=True,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
train_features = tokenizer(list(X_train), **encode_options)
test_features = tokenizer(list(X_test), **encode_options)

In [13]:
# Sanity check: one row per training example, one column per token position.
train_features['input_ids'].shape

torch.Size([7650, 512])

In [14]:
# Persist the encoded texts, one file per split, in the matching split folder.
# NOTE(review): what gets saved is the object returned by the tokenizer call,
# not a bare tensor; confirm that whatever loads these files expects that.
train_features_file = os.path.join(TRAIN_PATH, 'train_features.pt')
test_features_file = os.path.join(TEST_PATH, 'test_features.pt')
torch.save(train_features, train_features_file)
torch.save(test_features, test_features_file)

### Creating the labels 

In [15]:
# Number of distinct labels, i.e. the number of classes to predict.
len(df['Grp_change_1'].unique())

28

In [16]:
# Give every distinct label an integer id, numbered in the order in which the
# labels first appear in the table, and build the reverse lookup alongside it.
dict_val_to_target = {
    label: class_id
    for class_id, label in enumerate(df['Grp_change_1'].unique())
}
dict_target_to_val = {
    class_id: label
    for label, class_id in dict_val_to_target.items()
}

In [17]:
# Display both lookup tables: label -> integer id, then integer id -> label.
print(dict_val_to_target,'\n')
print(dict_target_to_val,'\n')

{'5': 0, '1': 1, '2': 2, '4': 3, 'Other': 4, '6': 5, '0': 6, 'GRP_3': 7, '3': 8, '8': 9, 'GRP_4': 10, 'GRP_5': 11, 'GRP_6': 12, 'GRP_8': 13, 'GRP_9': 14, 'GRP_10': 15, 'GRP_12': 16, 'GRP_13': 17, 'GRP_14': 18, '7': 19, 'GRP_16': 20, 'GRP_18': 21, 'GRP_19': 22, 'GRP_2': 23, 'GRP_24': 24, 'GRP_25': 25, 'GRP_29': 26, 'GRP_33': 27} 

{0: '5', 1: '1', 2: '2', 3: '4', 4: 'Other', 5: '6', 6: '0', 7: 'GRP_3', 8: '3', 9: '8', 10: 'GRP_4', 11: 'GRP_5', 12: 'GRP_6', 13: 'GRP_8', 14: 'GRP_9', 15: 'GRP_10', 16: 'GRP_12', 17: 'GRP_13', 18: 'GRP_14', 19: '7', 20: 'GRP_16', 21: 'GRP_18', 22: 'GRP_19', 23: 'GRP_2', 24: 'GRP_24', 25: 'GRP_25', 26: 'GRP_29', 27: 'GRP_33'} 



In [18]:
# Translate the string labels of each split into their integer ids, then wrap
# the ids in tensors (one entry per example).
train_label_ids = [dict_val_to_target[label] for label in y_train]
test_label_ids = [dict_val_to_target[label] for label in y_test]
y_train_ = torch.tensor(train_label_ids)
y_test_ = torch.tensor(test_label_ids)

In [19]:
# One integer label per training example.
y_train_.shape

torch.Size([7650])

In [20]:
# Persist the integer label tensors, one file per split, in the matching split folder.
train_labels_file = os.path.join(TRAIN_PATH, 'train_labels.pt')
test_labels_file = os.path.join(TEST_PATH, 'test_labels.pt')
torch.save(y_train_, train_labels_file)
torch.save(y_test_, test_labels_file)

### Computing balanced class weights

In [21]:
# Integer ids for the labels of every row in the table (both splits together);
# the class weights below are computed from this full array.
all_label_ids = [dict_val_to_target[label] for label in df['Grp_change_1']]
y = np.array(all_label_ids)

In [22]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weighting gives rarer classes a larger weight. The class ids are
# passed in sorted order, so balance[i] is the weight of class id i.
class_ids = np.unique(y)
balance = compute_class_weight(class_weight='balanced', classes=class_ids, y=y)
balance

array([0.57823129, 0.29022125, 0.65993789, 0.40314931, 0.27597403,
       0.90080543, 1.99718045, 1.51785714, 0.83170254, 1.53318903,
       3.03571429, 2.35326689, 1.64984472, 0.45926086, 1.20464853,
       2.16836735, 1.18121178, 2.09359606, 2.57263923, 2.16836735,
       3.57142857, 3.44967532, 1.41196013, 1.25963248, 1.05042017,
       2.61699507, 3.12960236, 2.83711615])

In [23]:
# Store the class weights next to the training tensors.
balance_file = os.path.join(TRAIN_PATH, 'balance.npy')
np.save(balance_file, balance)

In [24]:
# One weight per class; this should equal the number of distinct labels.
len(balance)

28