In [1]:
import pandas as pd 
import numpy as np

In [2]:
ticket_dat = pd.read_csv('cleaned_translated.csv')
FAQ_dat = pd.read_csv('FAQ_translation.csv')
print('Ticket Data: {} Rows'.format(ticket_dat.shape[0]))
print('FAQ Data: {} Rows'.format(FAQ_dat.shape[0]))

Ticket Data: 18288 Rows
FAQ Data: 389 Rows


In [3]:
# Print Columns in Ticket Data
ticket_dat.columns.values

array(['tn', 'title', 'sensitiv', 'Queue', 'name', 'create_time', 'id',
       'ticket_id', 'article_type_id', 'article_sender_type_id', 'a_from',
       'a_reply_to', 'a_to', 'a_cc', 'a_subject', 'a_message_id',
       'a_content_type', 'a_body', 'incoming_time', 'content_path',
       'valid_id', 'create_time1', 'create_by', 'change_time',
       'change_by', 'a_in_reply_to', 'a_references', 'a_message_id_md5',
       'a_body_clean', 'lang', 'translation'], dtype=object)

In [4]:
# Print Columns in FAQ Data
FAQ_dat.columns.values

array(['ques_id', 'language', 'ques', 'ansTitle', 'ansContent',
       'translation', 'translation_title', 'ques_translation'],
      dtype=object)

# Lets Settle for A Final Form of the Data
## Tickets
ticket_no, article_id (with prefix for ticket vs FAQ), content_original, content_clean, lang, content_translate

## FAQs
ques_id (prefix for FAQ question), ans_id (prefix for FAQ ans), ansTitle_cleaned, ansContent_cleaned, ansTitle_translate, ansContent_translate, quesContent_cleaned, quesContent_translate




# Ticket Data

# Removing Some Rows and Final Preprocessing

First remove non english / originally german enteries from ticket data (minor loss of data). Remember though, some "conversations" lost as a result.

In [5]:
ticket_dat.groupby('lang').count().tn

lang
ar           1
co           3
cs           1
da           7
de       14498
el           1
en        3738
fi           2
fr           7
gd           3
haw          1
hi           1
hmn          1
id           1
it           1
ku           1
lb           8
no           3
pl           3
ro           1
sk           1
sl           1
tr           1
zh-CN        3
Name: tn, dtype: int64

In [6]:
ticket_dat = ticket_dat[ticket_dat.lang.isin(['en', 'de'])]
print('{} Rows Remain'.format(ticket_dat.shape[0]))

18236 Rows Remain


Remove data with NaN in the content


In [7]:
print('{} NaN enteries removed'.format(pd.isnull(ticket_dat.a_body_clean).sum()))
ticket_dat = ticket_dat[~pd.isnull(ticket_dat.a_body_clean)]

32 NaN enteries removed


## Rename some Columns

In [8]:
ticket_dat.columns.values

array(['tn', 'title', 'sensitiv', 'Queue', 'name', 'create_time', 'id',
       'ticket_id', 'article_type_id', 'article_sender_type_id', 'a_from',
       'a_reply_to', 'a_to', 'a_cc', 'a_subject', 'a_message_id',
       'a_content_type', 'a_body', 'incoming_time', 'content_path',
       'valid_id', 'create_time1', 'create_by', 'change_time',
       'change_by', 'a_in_reply_to', 'a_references', 'a_message_id_md5',
       'a_body_clean', 'lang', 'translation'], dtype=object)

In [9]:
ticket_column_rename={"tn": "ticket_id", "id": "article_id", "a_body": "content_original", 
                      "a_body_clean": "content_cleaned", "translation": "content_translated",
                     'a_subject': "subject"}

ticket_dat.rename(index=str, columns = ticket_column_rename, inplace=True)

## Drop Some Columns

In [10]:
ticket_columns_to_drop = ['title', 'sensitiv', 'Queue', 'name', 'create_time', 'article_type_id', 
                          'article_sender_type_id', 'a_from', 'a_reply_to', 'a_to', 'a_cc', 
                          'a_message_id', 'a_content_type', 'incoming_time', 'content_path',
                          'valid_id', 'create_time1', 'create_by', 'change_time', 'change_by', 'a_in_reply_to',
                          'a_references', 'a_message_id_md5']

ticket_dat.drop(ticket_columns_to_drop, axis=1, inplace=True)

In [11]:
ticket_dat.head()

Unnamed: 0,ticket_id,article_id,ticket_id.1,subject,content_original,content_cleaned,lang,content_translated
0,18014685,2177823,424446,Registrierung als Hörer,Sehr geehrte Damen und Herren\n\n \n\nIch habe...,Sehr geehrte Damen und Herren\n\n \n\nIch habe...,de,"Dear Ladies and Gentlemen, I still have about ..."
1,18014685,2177987,424446,Re: [ID#18014685] Registrierung als Hörer,Sehr geehrter Herr Daniel Aeppli \n\nGlücklich...,Sehr geehrter Herr Daniel Aeppli \n\nGlücklich...,de,"Dear Mr. Daniel Aeppli Fortunately, you have a..."
2,18014685,2178318,424446,Re: [ID#18014685] Registrierung als Hörer,Guten Tag Herr Greutee\n\nBesten Dank.\n\nDie ...,Guten Tag Herr Greutee\n\nBesten Dank.\n\nDie ...,de,Hello Mr. Greutee Thank you. The details: 25.0...
3,18014685,2178343,424446,Re: [ID#18014685] Registrierung als Hörer,Sehr geehrter Herr Daniel Aeppli\n \nIch habe ...,Sehr geehrter Herr Daniel Aeppli\n \nIch habe ...,de,Dear Mr. Daniel Aeppli I have sent you a passw...
4,18014685,2326946,424446,Re: [ID#18014685] Registrierung als Hörer,Guten Tag Herr Greuter\n\nMit dem abgeänderten...,Guten Tag Herr Greuter\n\nMit dem abgeänderten...,de,Hello Mr. Greuter With the changed password I ...


# FAQ Data

In [12]:
FAQ_dat.head()

Unnamed: 0,ques_id,language,ques,ansTitle,ansContent,translation,translation_title,ques_translation
0,8502,EN,Received a phishing mail?,Phishing Mail warning,This is a phishing mail. You should delete it....,This is a phishing mail. You should delete it....,Phishing Mail warning,Received a phishing mail?
1,8503,EN,Lockout on Account,My account is locked,You had a lockout on your account because you ...,You had a lockout on your account because you ...,My account is locked,Lockout on Account
2,8506,EN,Blocking mailsender,Block a mailsender,You can add the sender on your personal blackl...,You can add the sender on your personal blackl...,Block a mailsender,Blocking mailsender
3,8509,EN,Credit overdrawn,I can't print anymore,Your credit is 6.90.- in minus. You have to lo...,Your credit is 6.90.- in minus. You have to lo...,I can't print anymore,Credit overdrawn
4,8509,EN,Credit overdrawn,Why is my account in minus,The reason why your account is minus is the fo...,The reason why your account is minus is the fo...,Why is my account in minus,Credit overdrawn


In [13]:
# Rename Columns First
faq_column_rename={"ques_id": "faq_id", "language": "lang", "translation": "ans_content_translated", 
                      "ques_translation": "ques_content_translation", "translation_title": "ans_title_translation",
                     'ansTitle': "ans_title", "ansContent": "ans_content"}

FAQ_dat.rename(index=str, columns = faq_column_rename, inplace=True)

In [14]:
# Make the langauge lower case 
FAQ_dat['lang'] = FAQ_dat.lang.str.lower()

In [15]:
FAQ_dat.head()

Unnamed: 0,faq_id,lang,ques,ans_title,ans_content,ans_content_translated,ans_title_translation,ques_content_translation
0,8502,en,Received a phishing mail?,Phishing Mail warning,This is a phishing mail. You should delete it....,This is a phishing mail. You should delete it....,Phishing Mail warning,Received a phishing mail?
1,8503,en,Lockout on Account,My account is locked,You had a lockout on your account because you ...,You had a lockout on your account because you ...,My account is locked,Lockout on Account
2,8506,en,Blocking mailsender,Block a mailsender,You can add the sender on your personal blackl...,You can add the sender on your personal blackl...,Block a mailsender,Blocking mailsender
3,8509,en,Credit overdrawn,I can't print anymore,Your credit is 6.90.- in minus. You have to lo...,Your credit is 6.90.- in minus. You have to lo...,I can't print anymore,Credit overdrawn
4,8509,en,Credit overdrawn,Why is my account in minus,The reason why your account is minus is the fo...,The reason why your account is minus is the fo...,Why is my account in minus,Credit overdrawn


# Final Two Datasets

## Ticket

In [16]:
ticket_dat.head(10)

Unnamed: 0,ticket_id,article_id,ticket_id.1,subject,content_original,content_cleaned,lang,content_translated
0,18014685,2177823,424446,Registrierung als Hörer,Sehr geehrte Damen und Herren\n\n \n\nIch habe...,Sehr geehrte Damen und Herren\n\n \n\nIch habe...,de,"Dear Ladies and Gentlemen, I still have about ..."
1,18014685,2177987,424446,Re: [ID#18014685] Registrierung als Hörer,Sehr geehrter Herr Daniel Aeppli \n\nGlücklich...,Sehr geehrter Herr Daniel Aeppli \n\nGlücklich...,de,"Dear Mr. Daniel Aeppli Fortunately, you have a..."
2,18014685,2178318,424446,Re: [ID#18014685] Registrierung als Hörer,Guten Tag Herr Greutee\n\nBesten Dank.\n\nDie ...,Guten Tag Herr Greutee\n\nBesten Dank.\n\nDie ...,de,Hello Mr. Greutee Thank you. The details: 25.0...
3,18014685,2178343,424446,Re: [ID#18014685] Registrierung als Hörer,Sehr geehrter Herr Daniel Aeppli\n \nIch habe ...,Sehr geehrter Herr Daniel Aeppli\n \nIch habe ...,de,Dear Mr. Daniel Aeppli I have sent you a passw...
4,18014685,2326946,424446,Re: [ID#18014685] Registrierung als Hörer,Guten Tag Herr Greuter\n\nMit dem abgeänderten...,Guten Tag Herr Greuter\n\nMit dem abgeänderten...,de,Hello Mr. Greuter With the changed password I ...
6,18022254,2214703,432015,"[ID#18022254] Malware Suspicion ""ETPRO TROJAN ...",Dear Pelyuan Zhang\n\nWe contact you because o...,Dear Pelyuan Zhang\n\nWe contact you because o...,en,Dear Pelyuan Zhang\n\nWe contact you because o...
9,18022254,2305936,432015,"Re: [ID#18022254] Malware Suspicion ""ETPRO TRO...",Dear Pelyuan Zhang\n\nCould you look into this...,Dear Pelyuan Zhang\n\nCould you look into this...,en,Dear Pelyuan Zhang\n\nCould you look into this...
10,18022254,2307137,432015,Re: Change Queue!,VPN ist gesperrt\n\n07/02/2018 09:25 - Elmar S...,VPN ist gesperrt\n\n07/02/2018 09:25 - Elmar S...,de,VPN is blocked 07/02/2018 09:25 - Elmar Sales ...
11,18022254,2401081,432015,"RE: [ID#18022254] Malware Suspicion ""ETPRO TRO...","Dear Elmar Heeb,\n\nI am sorry for replying so...","Dear Elmar Heeb,\n\nI am sorry for replying so...",en,"Dear Elmar Heeb,\n\nI am sorry for replying so..."
12,18022254,2401666,432015,"Re: [ID#18022254] Malware Suspicion ""ETPRO TRO...",Dear Mr Peiyuan Zhang\n \nYour network is open...,Dear Mr Peiyuan Zhang\n \nYour network is open...,en,Dear Mr Peiyuan Zhang\n \nYour network is open...


## FAQ

In [17]:
FAQ_dat.head(10)

Unnamed: 0,faq_id,lang,ques,ans_title,ans_content,ans_content_translated,ans_title_translation,ques_content_translation
0,8502,en,Received a phishing mail?,Phishing Mail warning,This is a phishing mail. You should delete it....,This is a phishing mail. You should delete it....,Phishing Mail warning,Received a phishing mail?
1,8503,en,Lockout on Account,My account is locked,You had a lockout on your account because you ...,You had a lockout on your account because you ...,My account is locked,Lockout on Account
2,8506,en,Blocking mailsender,Block a mailsender,You can add the sender on your personal blackl...,You can add the sender on your personal blackl...,Block a mailsender,Blocking mailsender
3,8509,en,Credit overdrawn,I can't print anymore,Your credit is 6.90.- in minus. You have to lo...,Your credit is 6.90.- in minus. You have to lo...,I can't print anymore,Credit overdrawn
4,8509,en,Credit overdrawn,Why is my account in minus,The reason why your account is minus is the fo...,The reason why your account is minus is the fo...,Why is my account in minus,Credit overdrawn
5,8528,en,Reset NAS permissions,Reset NAS permissions,I've reset your permissions. Can you please tr...,I've reset your permissions. Can you please tr...,Reset NAS permissions,Reset NAS permissions
6,8511,en,Settings for VPN,VPN settings of the ETH,The settings for the ETH VPN are as follows:Se...,The settings for the ETH VPN are as follows:Se...,VPN settings of the ETH,Settings for VPN
7,8507,en,Mail blocked mistakenly,Mail blocked mistakenly,You can add the sender on your personal whitel...,You can add the sender on your personal whitel...,Mail blocked mistakenly,Mail blocked mistakenly
8,8511,en,Settings for VPN,Installation VPN Client,You can find all the informations for the VPN ...,You can find all the informations for the VPN ...,Installation VPN Client,Settings for VPN
9,8532,en,How can I install software from IDES?,Software installation from IDES,Please download first all software local to y...,Please download first all software local to y...,Software installation from IDES,How can I install software from IDES?


# Export to file

In [18]:
ticket_dat.to_csv('data/ticket_dat.csv')
FAQ_dat.to_csv('data/faq_dat.csv')