# A Text Mining Approach to Analyzing Cyber Security Related Articles

## PART-1: Data Collection

__Parsing the XML files__

On the [NSF web site](https://nsf.gov/awardsearch/download.jsp), all articles are available in XML format, grouped into separate zipped folders by year. To gather all the data in one place, we download every zipped folder and extract all of them into the same folder (for example, the __NFS_Text_Mining__ folder). Then we open a Jupyter notebook in that folder to parse all the XML files and convert them to a tabular format, such as CSV.

__Hint:__ If the Jupyter notebook file won't open/run in the folder where all the XML files exist, try running the same code as a .py file from the command line.


__Creating A List With All XML File Names__

In [None]:
import pandas as pd
from os import listdir
import xml.etree.ElementTree as ET

# Collect the names of the award XML files stored in the data folder.
#
# Only entries ending in '.xml' are kept. This excludes the notebook itself
# ('XML_Parse.ipynb') and any other non-XML entry without calling
# list.remove(), which raises ValueError when the notebook is not present
# in the folder (e.g. when this code is run as a plain .py script).
# The names are sorted because listdir() returns entries in arbitrary order,
# and the position in file_list is later used as the row number of each award.
file_list = sorted(
    entry
    for entry in listdir('/Users/.../NFS_Text_Mining')
    if entry.lower().endswith('.xml')
)

print('The total XML files number is: {}'.format(len(file_list)))

__Create an Empty Pandas DataFrame with Related Columns__

In [None]:
# Column layout of the award table.
cols = [
    'AwardTitle',
    'AwardEffectiveDate',
    'AwardExpirationDate',
    'AwardTotalIntnAmount',
    'AwardAmount',
    'AwardInstrument',
    'Organization_Code',
    'Directorate_Abbreviation',
    'Directorate_LongName',
    'Division_Abbreviation',
    'Division_LongName',
    'ProgramOfficer',
    'Abstract',
    'MinAmdLetterDate',
    'MaxAmdLetterDate',
    'ARRAAmount',
    'AwardID',
    'Investigator_FirstName',
    'Investigator_EmailAddress',
    'Investigator_StartDate',
    'Investigator_EndDate',
    'Investigator_RoleCode',
    'Institution_Name',
    'Institution_CityName',
    'Institution_ZipCode',
    'Institution_PhoneNumber',
    'Institution_StreetAddress',
    'Institution_CountryName',
    'Institution_StateName',
    'Institution_StateCode',
    'ProgramElement_Code',
    'ProgramElement_Text',
]

# Column layout of the investigator table (df_investigator).
cols2 = [
    'AwardID',
    'Investigator_FirstName',
    'Investigator_LastName',
    'Investigator_EmailAddress',
    'Investigator_StartDate',
    'Investigator_EndDate',
    'Investigator_RoleCode',
]

# Pre-allocate both tables with empty cells: one row per entry of file_list
# for the awards, and twice that many rows for the investigators.
df = pd.DataFrame(index=range(len(file_list)), columns=cols)
df_investigator = pd.DataFrame(index=range(2 * len(file_list)), columns=cols2)

# Report the (rows, columns) shape of each freshly created table.
print("'df' dataframe consist of {} rows and {} columns".format(*df.shape))
print("'df_investigator' dataframe consist of {} rows and {} columns".format(*df_investigator.shape))

__Parsing The Data And Recording Them To DataFrame__

In [None]:
def _required_text(node, *path):
    """Return the text of the element reached by following *path* below *node*.

    Raises AttributeError when any element on the path is absent. The main
    loop treats that as "this file cannot be used" and skips the file.
    """
    for tag in path:
        node = node.find(tag)
    return node.text


def _optional_text(node, *path):
    """Like _required_text, but return the 'NaN' placeholder if the path is absent."""
    try:
        return _required_text(node, *path)
    except AttributeError:
        return 'NaN'


def _award_record(data):
    """Build the {column name: value} mapping of one <Award> element.

    Mandatory fields raise AttributeError when missing; optional fields
    fall back to 'NaN' individually, so one missing sibling no longer
    discards the values that are present.
    """
    return {
        'AwardTitle': _required_text(data, 'AwardTitle'),
        'AwardEffectiveDate': _required_text(data, 'AwardEffectiveDate'),
        'AwardExpirationDate': _required_text(data, 'AwardExpirationDate'),
        'AwardTotalIntnAmount': _optional_text(data, 'AwardTotalIntnAmount'),
        'AwardAmount': _required_text(data, 'AwardAmount'),
        'AwardInstrument': _required_text(data, 'AwardInstrument', 'Value'),
        'Organization_Code': _required_text(data, 'Organization', 'Code'),
        'Directorate_Abbreviation': _optional_text(data, 'Organization', 'Directorate', 'Abbreviation'),
        'Directorate_LongName': _optional_text(data, 'Organization', 'Directorate', 'LongName'),
        'Division_Abbreviation': _optional_text(data, 'Organization', 'Division', 'Abbreviation'),
        'Division_LongName': _optional_text(data, 'Organization', 'Division', 'LongName'),
        'ProgramOfficer': _required_text(data, 'ProgramOfficer', 'SignBlockName'),
        'Abstract': _required_text(data, 'AbstractNarration'),
        'MinAmdLetterDate': _required_text(data, 'MinAmdLetterDate'),
        'MaxAmdLetterDate': _required_text(data, 'MaxAmdLetterDate'),
        'ARRAAmount': _required_text(data, 'ARRAAmount'),
        'AwardID': _required_text(data, 'AwardID'),
        'Institution_Name': _optional_text(data, 'Institution', 'Name'),
        'Institution_CityName': _optional_text(data, 'Institution', 'CityName'),
        'Institution_ZipCode': _optional_text(data, 'Institution', 'ZipCode'),
        'Institution_PhoneNumber': _optional_text(data, 'Institution', 'PhoneNumber'),
        'Institution_StreetAddress': _optional_text(data, 'Institution', 'StreetAddress'),
        'Institution_CountryName': _optional_text(data, 'Institution', 'CountryName'),
        'Institution_StateName': _optional_text(data, 'Institution', 'StateName'),
        'Institution_StateCode': _optional_text(data, 'Institution', 'StateCode'),
        'ProgramElement_Code': _optional_text(data, 'ProgramElement', 'Code'),
        'ProgramElement_Text': _optional_text(data, 'ProgramElement', 'Text'),
    }


def _investigator_record(award_id, investigator):
    """Build the {column name: value} mapping of one <Investigator> element.

    First and last name are mandatory (AttributeError when missing); the
    remaining fields fall back to 'NaN' individually.
    """
    return {
        'AwardID': award_id,
        'Investigator_FirstName': _required_text(investigator, 'FirstName'),
        'Investigator_LastName': _required_text(investigator, 'LastName'),
        'Investigator_EmailAddress': _optional_text(investigator, 'EmailAddress'),
        'Investigator_StartDate': _optional_text(investigator, 'StartDate'),
        'Investigator_EndDate': _optional_text(investigator, 'EndDate'),
        'Investigator_RoleCode': _optional_text(investigator, 'RoleCode'),
    }


# Parse every file of file_list: the award goes to row n of df, and each of
# its investigators goes to the next free row of df_investigator.
counter_for_investigators = 0  # next free row label in df_investigator
for n, file_name in enumerate(file_list):
    try:
        # file_name is passed as-is, so it is resolved against the current
        # working directory.
        root = ET.parse(file_name).getroot()
        for data in root.findall('Award'):
            # Parse everything first, so a file with a missing mandatory
            # field leaves no half-written rows behind.
            award = _award_record(data)
            investigators = [
                _investigator_record(award['AwardID'], investigator)
                for investigator in data.findall('Investigator')
            ]

            # Write cells with df.loc[row, column] = value. The previous
            # df.loc[row].column = value form assigns to an intermediate
            # Series and is not guaranteed to reach the DataFrame.
            # .loc also appends the row when the label does not exist yet,
            # so the investigator table can grow past its pre-allocated size.
            for record in investigators:
                for column, value in record.items():
                    df_investigator.loc[counter_for_investigators, column] = value
                # Advance once per stored investigator; an award without
                # investigators therefore leaves the counter untouched.
                counter_for_investigators += 1

            for column, value in award.items():
                df.loc[n, column] = value

        print("Finish", n, file_name)
    except Exception as err:
        # Best-effort per-file boundary: keep going, but report what was
        # skipped and why. Unlike a bare except, this does not trap
        # KeyboardInterrupt, so a long run can still be interrupted.
        print("Skipped", n, file_name, repr(err))

print("Finish!!!")

__Save The Data__

We prefer the parquet format because it stores the data in compressed form, so it takes up less space.

In [None]:
# Write the award table to disk twice: as plain CSV and as a
# gzip-compressed parquet file.
df.to_csv(path_or_buf='award.csv')
df.to_parquet(path='award_parquet.gzip', compression='gzip')

In [None]:
# Write the investigator table to disk twice: as plain CSV and as a
# gzip-compressed parquet file.
df_investigator.to_csv(path_or_buf='investigator.csv')
df_investigator.to_parquet(path='investigator_parquet.gzip', compression='gzip')