# 0. Download Dataset

Import the needed packages. Nltk may needs to be downloaded. Please use pip install for the packages you currently do not have. 

In [6]:
# import of nltk library for further use
import nltk
from nltk import Tree
from nltk.corpus import stopwords
import re
import jieba
import os
import requests
# downloading of the packages in nltk
# nltk.download()

In case your current working directory does not have enough storage, you can change to another local directory. 

In [7]:
# change working directory if needed
os.chdir("D:/Dataset/")

Get the dataset file from github. The original dataset can be found at: https://doi.org/10.5281/zenodo.5591908.

In [12]:
# the text for testing of preprocessing
path_here = 'https://media.githubusercontent.com/media/patrickzw7/TextMiningProject/master/'
f = requests.get(path_here + 'FT-en-zh.txt')
f.encoding = f.apparent_encoding
text_org = f.text

Generate the dataset files using the original raw dataset file to your local directory. Some basic text cleaning is performed.

In [13]:
# split the different texts by the new lines
text_all = text_org.split("\n")

# try to make two new directories on your local working directory
try:
    os.mkdir("DatasetEnglish/")
    os.mkdir("DatasetChinese/")
except:
    pass

# try to do basic text cleaning and write the texts into seperate Chinese and English files for each text
for i in range(len(text_all) - 1):
    text = text_all[i]
    text = re.sub("&#\d\d\d\d", "", text)
    text = text.replace("@", "")
    texts = text.split(";")
    text_name = texts[0]
    text_time = texts[1]
    text_en_title = texts[2]
    text_zh_title = texts[3]
    text_en_text = texts[5].split(">")[-1]
    text_zh_text = texts[6]
    if not os.path.exists(f'{text_name}.txt'):
        with open(f'DatasetEnglish/{text_name}.txt', 'a', encoding = "utf-8") as f:
            f.write(text_en_text)
    if not os.path.exists(f'{text_name}.txt'):
        with open(f'DatasetChinese/{text_name}.txt', 'a', encoding = "utf-8") as f:
            f.write(text_zh_text)

# 1. Preprocessing for English Dataset

Read the sample data (the first file) for English dataset. 

In [14]:
# the text for testing of preprocessing
path_here = r'DatasetEnglish/'
f = open(path_here + '1001571.txt', encoding = 'utf-8')
text = f.read()
text

'London must not be complacent about its continuing pre-eminence as a financial centre but the availability of skilled staff, flexible labour markets and comparatively light regulation now make the City the best place for global institutions to do business, according to the business district’s governing body.  The Corporation of London’s latest research into the ranking of financial centres, published yesterday, also suggests China’s increasing economic power makes it likely to host any new global financial services centre in future years.  Compared with a previous study in 2003, London and New York had moved even further ahead of Frankfurt and Paris when assessed according to criteria seen as important for competitiveness by 400 individuals working in the industry.  This year London was slightly more likely to be named as the chosen location for transactions requiring a global financial centre, whereas New York came top in the previous survey. But the report said the two cities would 

## 1a. Sentence Segmentation

In [15]:
# we use sent_tokenize from nltk to do the sentence segmentation
sentences = nltk.sent_tokenize(text)

# print the segmented sentences
for i in sentences:
    print([i])

['London must not be complacent about its continuing pre-eminence as a financial centre but the availability of skilled staff, flexible labour markets and comparatively light regulation now make the City the best place for global institutions to do business, according to the business district’s governing body.']
['The Corporation of London’s latest research into the ranking of financial centres, published yesterday, also suggests China’s increasing economic power makes it likely to host any new global financial services centre in future years.']
['Compared with a previous study in 2003, London and New York had moved even further ahead of Frankfurt and Paris when assessed according to criteria seen as important for competitiveness by 400 individuals working in the industry.']
['This year London was slightly more likely to be named as the chosen location for transactions requiring a global financial centre, whereas New York came top in the previous survey.']
['But the report said the two

## 1b. Word Segmentation

In [16]:
# we use word_tokenize from nltk to do the word segmentation
tokens = nltk.word_tokenize(text)

# print the segmented words
for i in tokens:
    print([i])

['London']
['must']
['not']
['be']
['complacent']
['about']
['its']
['continuing']
['pre-eminence']
['as']
['a']
['financial']
['centre']
['but']
['the']
['availability']
['of']
['skilled']
['staff']
[',']
['flexible']
['labour']
['markets']
['and']
['comparatively']
['light']
['regulation']
['now']
['make']
['the']
['City']
['the']
['best']
['place']
['for']
['global']
['institutions']
['to']
['do']
['business']
[',']
['according']
['to']
['the']
['business']
['district']
['’']
['s']
['governing']
['body']
['.']
['The']
['Corporation']
['of']
['London']
['’']
['s']
['latest']
['research']
['into']
['the']
['ranking']
['of']
['financial']
['centres']
[',']
['published']
['yesterday']
[',']
['also']
['suggests']
['China']
['’']
['s']
['increasing']
['economic']
['power']
['makes']
['it']
['likely']
['to']
['host']
['any']
['new']
['global']
['financial']
['services']
['centre']
['in']
['future']
['years']
['.']
['Compared']
['with']
['a']
['previous']
['study']
['in']
['2003']
[',']
['L

## 1c.Stop Words Removing

In [17]:
# use word.lower() to avoid the first word in text, like "The", 
# not being removed as only "the" is included in stopwords 
tokens_stop_remove = [word for word in tokens if not word.lower() in set(stopwords.words('english'))]

# print the tokens after removing the stop words
tokens_stop_remove

['London',
 'must',
 'complacent',
 'continuing',
 'pre-eminence',
 'financial',
 'centre',
 'availability',
 'skilled',
 'staff',
 ',',
 'flexible',
 'labour',
 'markets',
 'comparatively',
 'light',
 'regulation',
 'make',
 'City',
 'best',
 'place',
 'global',
 'institutions',
 'business',
 ',',
 'according',
 'business',
 'district',
 '’',
 'governing',
 'body',
 '.',
 'Corporation',
 'London',
 '’',
 'latest',
 'research',
 'ranking',
 'financial',
 'centres',
 ',',
 'published',
 'yesterday',
 ',',
 'also',
 'suggests',
 'China',
 '’',
 'increasing',
 'economic',
 'power',
 'makes',
 'likely',
 'host',
 'new',
 'global',
 'financial',
 'services',
 'centre',
 'future',
 'years',
 '.',
 'Compared',
 'previous',
 'study',
 '2003',
 ',',
 'London',
 'New',
 'York',
 'moved',
 'even',
 'ahead',
 'Frankfurt',
 'Paris',
 'assessed',
 'according',
 'criteria',
 'seen',
 'important',
 'competitiveness',
 '400',
 'individuals',
 'working',
 'industry',
 '.',
 'year',
 'London',
 'slightly

## 1d. Lemmatization and Stemming

In [18]:
#import the lemmatizer library
from nltk.stem import WordNetLemmatizer
#define the lemmatizer
lemma = WordNetLemmatizer()
#define a list to store the words after lemmatizing
tokens_lemmatized = []
#do a loop to lemmatize the words in tokens
for i in range(int(len(tokens))):
    #append the lemmatized version for each word into the new list
    tokens_lemmatized.append(lemma.lemmatize(tokens[i].lower()))
#import the stemming library
from nltk.stem.porter import PorterStemmer
#define the stemmer
stemmer = PorterStemmer()
#define a list to store the words after stemming
tokens_stemmed = []
#do a loop to stem the words in tokens
for i in range(int(len(tokens))):
    #append the stemmed version for each word into the new list
    tokens_stemmed.append(stemmer.stem(tokens[i].lower()))
#print out the lemmatized and stemmed result
[tokens_lemmatized, tokens_stemmed]

[['london',
  'must',
  'not',
  'be',
  'complacent',
  'about',
  'it',
  'continuing',
  'pre-eminence',
  'a',
  'a',
  'financial',
  'centre',
  'but',
  'the',
  'availability',
  'of',
  'skilled',
  'staff',
  ',',
  'flexible',
  'labour',
  'market',
  'and',
  'comparatively',
  'light',
  'regulation',
  'now',
  'make',
  'the',
  'city',
  'the',
  'best',
  'place',
  'for',
  'global',
  'institution',
  'to',
  'do',
  'business',
  ',',
  'according',
  'to',
  'the',
  'business',
  'district',
  '’',
  's',
  'governing',
  'body',
  '.',
  'the',
  'corporation',
  'of',
  'london',
  '’',
  's',
  'latest',
  'research',
  'into',
  'the',
  'ranking',
  'of',
  'financial',
  'centre',
  ',',
  'published',
  'yesterday',
  ',',
  'also',
  'suggests',
  'china',
  '’',
  's',
  'increasing',
  'economic',
  'power',
  'make',
  'it',
  'likely',
  'to',
  'host',
  'any',
  'new',
  'global',
  'financial',
  'service',
  'centre',
  'in',
  'future',
  'year',

## 1e. Noun Phrase Extraction

In [19]:
# import of the RegexpParser library from nltk
from nltk import RegexpParser
# definition of the grammer
grammer = r"""
  NP: {<PP\$>?<JJ>*<NN>}   
      {<NNP>+}                
"""
# definition of the parser
parser = RegexpParser(grammer)
# tagging for tokens
tagged = nltk.pos_tag(tokens)
# use the parser to parse the tagged words
parse = parser.parse(tagged)
# define an empty list to put in the noun phrases
entity = []
for i in parse:
    # define an empty string variable
    entity_str = ""
    # check if it is a noun phrase that suits the requirements
    if type(i) == Tree:
        for token,pos in i.leaves():
            # put the words in the named entity into the string
            entity_str += str(token)
            # add an empty space between each word
            entity_str += " "
        # replace the incorrect signs
        entity_str = entity_str.replace("’", "")
        # strip the empty space in the string
        entity_str = entity_str.rstrip().lstrip()
        # append the final string to the named entity list
        # if entity_str.split()[0] != "s":
        entity.append(entity_str)
print(entity)

['London', 'pre-eminence', 'financial centre', 'availability', 'skilled staff', 'flexible labour', 'light regulation', 'City', 'place', 'business', 'business', 'district', '', 's', 'body', 'Corporation', 'London', 'research', 'ranking', 'yesterday', 'China', 'economic power', 'centre', 'previous study', 'London', 'New York', 'Frankfurt', 'Paris', 'competitiveness', 'industry', 'year', 'London', 'chosen', 'location', 'global financial centre', 'New York', 'previous survey', 'report', '“ neck', 'neck', '”', 'Paris', 'Frankfurt', 'pre-eminence', 'City', 'Wall Street', 'anything', '”', 'Michael Snyder', 'corporation', '', 's', 'chairman', 'policy', 'one', 'survey', 'London', 'New York', 'scope', 'China', 'Shanghai', 'Tokyo', 'previous importance', 'bureau-cracy', 'poor regulation', 'report', '', 'room', 'London', 'New York', 'future', 'example', 'survey', 'personal tax', 'concern', 'financial sector', 'moment', 'respondent', 'City', 'shortage', 'survey', 'London', 'expensive location', 'li

## 1f. Named Entity Extraction

In [20]:
# do the pos tagging process, get the identity for each word after stop words removing
chunks = nltk.ne_chunk(tagged)
# create an empty list to store the named entities
entity2 = []
for i in chunks:
    # define an empty string variable
    entity_str = ""
    # check if the chunk is a named entity
    if type(i) == Tree:
        for token,pos in i.leaves():
            # put the words in the named entity into the string
            entity_str += str(token)
            # add an empty space between each word
            entity_str += " "
        # strip the empty space in the string
        entity_str = entity_str.rstrip()
        # append the final string to the named entity list
        entity2.append(entity_str)
for i in chunks:
    if type(i) != Tree:
        print(i)

('must', 'MD')
('not', 'RB')
('be', 'VB')
('complacent', 'JJ')
('about', 'IN')
('its', 'PRP$')
('continuing', 'VBG')
('pre-eminence', 'NN')
('as', 'IN')
('a', 'DT')
('financial', 'JJ')
('centre', 'NN')
('but', 'CC')
('the', 'DT')
('availability', 'NN')
('of', 'IN')
('skilled', 'JJ')
('staff', 'NN')
(',', ',')
('flexible', 'JJ')
('labour', 'NN')
('markets', 'NNS')
('and', 'CC')
('comparatively', 'RB')
('light', 'JJ')
('regulation', 'NN')
('now', 'RB')
('make', 'VBP')
('the', 'DT')
('City', 'NNP')
('the', 'DT')
('best', 'JJS')
('place', 'NN')
('for', 'IN')
('global', 'JJ')
('institutions', 'NNS')
('to', 'TO')
('do', 'VB')
('business', 'NN')
(',', ',')
('according', 'VBG')
('to', 'TO')
('the', 'DT')
('business', 'NN')
('district', 'NN')
('’', 'NNP')
('s', 'NN')
('governing', 'VBG')
('body', 'NN')
('.', '.')
('The', 'DT')
('’', 'NNP')
('s', 'VBD')
('latest', 'JJS')
('research', 'NN')
('into', 'IN')
('the', 'DT')
('ranking', 'NN')
('of', 'IN')
('financial', 'JJ')
('centres', 'NNS')
(',', ',

## 1g. Overall Preprocessing

"python -m spacy download en_core_web_sm" is needed for anaconda prompt (or other terminals) for the en_core_web_sm from spacy.

In [21]:
#overall preprocessing process(together with the word embedding part in the next cell)

#import of the needed packages
#if not installed, pip install them
import os
from string import punctuation
import re
import nltk
#nltk.download()
from nltk.corpus import stopwords
import spacy

#path where we store the txt files
#change the path to your own path where you store your own test files
path = r"DatasetEnglish/"
#define the files as all files under the certain path
files = os.listdir(path)

# make a work directory
try:
    os.mkdir("Work/")
except:
    pass

#go through the files in files
for file in files:
    #print out the name of the file that is being processed now
    print(file)
    #check if the file is a txt file
    if file.endswith(".txt"):
        #if txt file satisfied, open it and read it into the text space
        text = open(path + file, errors = 'ignore', encoding = 'utf-8').read() + '\n\n'

        #preprocessing: change all words to lowercase
        text = text.lower()

        #preprocessing: remove the punctuations
        puncs = punctuation + u'.,;《》？！“”‘’@#￥%…&×（）——+【】{};；●，。&～、|\s:：'
        text = re.sub(r"[{}]+".format(puncs)," ",text)

        #preprocessing: remove the numbers 
        text = re.sub(r"[0-9]+", "", text)

        #preprocessing: word segmentation
        #we use word_tokenize from nltk to do the word segmentation
        tokens = nltk.word_tokenize(text)

        #preprocessing: remove the stop words 
        #define a list to store the words that are not stop words
        tokens_stop_remove = []
        #go through the words in tokens
        for i in range(0, int(len(tokens))):
            #if not a stop word, append it to the new list
            if tokens[i] not in set(stopwords.words('english')):
                tokens_stop_remove.append(tokens[i])

        #preprocessing: put the words after removing stop-words to new text space
        text_new = ""
        for i in tokens_stop_remove:
            text_new += i
            text_new += " "
        text_new = text_new.rstrip()

        #preprocessing: lemmatizing
        #initialize the English space
        sp = spacy.load('en_core_web_sm')
        sp.max_length = 4000000
        doc = sp(text_new)
        #initialize a string to store the final text after lemmatize
        text_final = ""
        for token in doc:
            #store the token after lemmatize, like "good" for the token "better"
            lemma_token = token.lemma_
            #add the token after lemmatize to the final string storing the lemmatized token
            text_final += lemma_token
            #add a blank space between each word
            text_final += " "
        #open a new file and write the words into it
        #change the path to your own path where you want to store the words.txt file 
        with open ("Work/wordsEnglish.txt", 'a', encoding = 'utf-8') as f:
            #a single line of words for each file 
            text_write = text_final + '\n'
            #write the words into the file
            f.write(text_write)

1001571.txt
1005614.txt
1005617.txt
1005672.txt
1005673.txt
1005674.txt
1005676.txt
1005677.txt
1005678.txt
1005679.txt
1005680.txt
1005681.txt
1005703.txt
1005704.txt
1005706.txt
1005707.txt
1005708.txt
1005710.txt
1005713.txt
1005714.txt
1005716.txt
1005736.txt
1005737.txt
1005738.txt
1005739.txt
1005740.txt
1005741.txt
1005742.txt
1005743.txt
1005744.txt
1005746.txt
1005748.txt
1005749.txt
1005756.txt
1005764.txt
1005765.txt
1005766.txt
1005767.txt
1005768.txt
1005769.txt
1005770.txt
1005771.txt
1005772.txt
1005773.txt
1005775.txt
1005776.txt
1005779.txt
1005780.txt
1005782.txt
1005793.txt
1005794.txt
1005795.txt
1005796.txt
1005797.txt
1005798.txt
1005799.txt
1005802.txt
1005803.txt
1005805.txt
1005809.txt
1005810.txt
1005813.txt
1005815.txt
1005816.txt
1005817.txt
1005818.txt
1005824.txt
1005825.txt
1005826.txt
1005827.txt
1005828.txt
1005830.txt
1005831.txt
1005835.txt
1005836.txt
1005837.txt
1005838.txt
1005839.txt
1005845.txt
1005846.txt
1005849.txt
1005850.txt
1005851.txt
1005

## 1h. Word Embedding

In [22]:
#import of word2vec library
import gensim
from gensim.models import word2vec
from gensim.models import Word2Vec
#load the text file word corpus
#change the path to your own path where you store the words.txt file generated above
corpus = word2vec.Text8Corpus("Work\\wordsEnglish.txt")
#initialize the model, also the word embedding process
model = Word2Vec(corpus, vector_size=100, window = 5, \
                              min_count = 2, workers = 4)
#save the model into a file
model.save("models.bin")
#load the model
gensim.models.Word2Vec.load("models.bin")
#get the similarity for two words using the model
print(model.wv.similarity("cat", "dog"))

0.8147048


get the 10 most similar words for some word, like "science" in this case

In [23]:
print(model.wv.most_similar("science"))

[('biology', 0.7422332167625427), ('mit', 0.7419455051422119), ('scientific', 0.7375688552856445), ('entrepreneurship', 0.7370805144309998), ('scientist', 0.7237802743911743), ('teaching', 0.719723641872406), ('advancement', 0.7190641164779663), ('physics', 0.7185468673706055), ('excellence', 0.7156362533569336), ('sciences', 0.7138203382492065)]


# 2. Preprocessing for Chinese Dataset

Read the sample data from the dataset. 

In [24]:
path_here = r'DatasetChinese/'
f = open(path_here + '1001571.txt', encoding = 'utf-8')
text = f.read()
text

'伦敦商业区的管理机构表示，尽管伦敦一直都是卓越的金融中心，但它决不能自满，不过伦敦拥有熟练的员工，灵活的劳动力市场，以及比较宽松的监管，伦敦金融城因此成为目前全球机构经营的最佳地点。  昨天，伦敦金融城当局(Corporation of London)公布了对各金融中心排名的最新研究，该研究还表示，由于中国的经济力量日益增强，未来数年，很可能出现新的全球金融服务中心。  按照400名业内人士指定的评估标准(这些标准被视为对竞争力很重要)，相比之前2003年的研究，伦敦和纽约相比法兰克福和巴黎的领先优势更大了。  今年，对那些需要全球金融中心的交易而言，伦敦成为首选地的可能性稍大，而之前的调查是纽约位居榜首。但这次的报告说，两城市将继续“并驾齐驱”。  “尽管巴黎与法兰克福是极为重要的区域金融中心……但相比过去，伦敦金融城与华尔街的声誉却更加卓著，”金融城当局政策与资源委员会主席迈克尔 斯奈德(Michael Snyder)说。  在受访者中，没人认为伦敦和纽约会在10年内失去全球金融中心的地位。但对于是否会出现第三个全球金融中心，受访者的意见存在分歧。多数人觉得，如果真的出现第三个，将会在中国，很可能是上海。受访者认为，由于官僚作风和监管不力，东京不可能恢复像以前那样的重要地位。  报告作者警告说，伦敦与纽约的实力通常都是历史因素造成的，因此它们没有理由自满，因为将来可能出现全新的有力挑战。  例如，该调查表示，未来三年，企业与个人税收制度很可能成为金融业更大的担忧，尽管多数都在寻找应对办法。  一名受访者表示，金融城的衍生品业务技能不足。而且调查显示，人们认为伦敦是个成本很高的地方。  但有人担心，伦敦相对宽松的监管负担正在加重，不过由于伦敦的雇员具有灵活性，素质优异，再加上人们认为欧盟其它地区的监管更成问题，因此这种担心也就减轻了。  技术和外包，尤其是向成本更低廉的亚洲城市外包，给两座头号城市金融业职位带来另一个威胁。  但在一家总部设在伦敦的投资银行，一名交易负责人表达了多数受访者的观点，他解释了为何离岸和外包业务不会对伦敦和纽约构成挑战，它们仍是中心业务选的最佳选址。  “没人能单方面转移流动资产，因此一旦建立了伦敦或纽约这样的全球中心，要搬迁几乎是不可能的，”他说道。'

## 2a. Sentence Segmentation

In [25]:
sentences = re.split('(。|！|\!|\.|？|\?)',text)
for i in sentences:
    if i != "。":
        print([i])

['伦敦商业区的管理机构表示，尽管伦敦一直都是卓越的金融中心，但它决不能自满，不过伦敦拥有熟练的员工，灵活的劳动力市场，以及比较宽松的监管，伦敦金融城因此成为目前全球机构经营的最佳地点']
['  昨天，伦敦金融城当局(Corporation of London)公布了对各金融中心排名的最新研究，该研究还表示，由于中国的经济力量日益增强，未来数年，很可能出现新的全球金融服务中心']
['  按照400名业内人士指定的评估标准(这些标准被视为对竞争力很重要)，相比之前2003年的研究，伦敦和纽约相比法兰克福和巴黎的领先优势更大了']
['  今年，对那些需要全球金融中心的交易而言，伦敦成为首选地的可能性稍大，而之前的调查是纽约位居榜首']
['但这次的报告说，两城市将继续“并驾齐驱”']
['  “尽管巴黎与法兰克福是极为重要的区域金融中心……但相比过去，伦敦金融城与华尔街的声誉却更加卓著，”金融城当局政策与资源委员会主席迈克尔 斯奈德(Michael Snyder)说']
['  在受访者中，没人认为伦敦和纽约会在10年内失去全球金融中心的地位']
['但对于是否会出现第三个全球金融中心，受访者的意见存在分歧']
['多数人觉得，如果真的出现第三个，将会在中国，很可能是上海']
['受访者认为，由于官僚作风和监管不力，东京不可能恢复像以前那样的重要地位']
['  报告作者警告说，伦敦与纽约的实力通常都是历史因素造成的，因此它们没有理由自满，因为将来可能出现全新的有力挑战']
['  例如，该调查表示，未来三年，企业与个人税收制度很可能成为金融业更大的担忧，尽管多数都在寻找应对办法']
['  一名受访者表示，金融城的衍生品业务技能不足']
['而且调查显示，人们认为伦敦是个成本很高的地方']
['  但有人担心，伦敦相对宽松的监管负担正在加重，不过由于伦敦的雇员具有灵活性，素质优异，再加上人们认为欧盟其它地区的监管更成问题，因此这种担心也就减轻了']
['  技术和外包，尤其是向成本更低廉的亚洲城市外包，给两座头号城市金融业职位带来另一个威胁']
['  但在一家总部设在伦敦的投资银行，一名交易负责人表达了多数受访者的观点，他解释了为何离岸和外包业务不会对伦敦和纽约构成挑战，它们仍是中心业务选的最佳选址']
['  “没人能单方面转移流动资产，因此一旦建立了伦敦或

## 2b. Words Segmentation

In [26]:
words = jieba.lcut(text)
words

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\DELL\AppData\Local\Temp\jieba.cache
Loading model cost 0.667 seconds.
Prefix dict has been built successfully.


['伦敦',
 '商业区',
 '的',
 '管理机构',
 '表示',
 '，',
 '尽管',
 '伦敦',
 '一直',
 '都',
 '是',
 '卓越',
 '的',
 '金融中心',
 '，',
 '但',
 '它',
 '决不能',
 '自满',
 '，',
 '不过',
 '伦敦',
 '拥有',
 '熟练',
 '的',
 '员工',
 '，',
 '灵活',
 '的',
 '劳动力',
 '市场',
 '，',
 '以及',
 '比较',
 '宽松',
 '的',
 '监管',
 '，',
 '伦敦',
 '金融城',
 '因此',
 '成为',
 '目前',
 '全球',
 '机构',
 '经营',
 '的',
 '最佳',
 '地点',
 '。',
 ' ',
 ' ',
 '昨天',
 '，',
 '伦敦',
 '金融城',
 '当局',
 '(',
 'Corporation',
 ' ',
 'of',
 ' ',
 'London',
 ')',
 '公布',
 '了',
 '对',
 '各',
 '金融中心',
 '排名',
 '的',
 '最新',
 '研究',
 '，',
 '该',
 '研究',
 '还',
 '表示',
 '，',
 '由于',
 '中国',
 '的',
 '经济',
 '力量',
 '日益',
 '增强',
 '，',
 '未来',
 '数年',
 '，',
 '很',
 '可能',
 '出现',
 '新',
 '的',
 '全球',
 '金融服务',
 '中心',
 '。',
 ' ',
 ' ',
 '按照',
 '400',
 '名',
 '业内人士',
 '指定',
 '的',
 '评估',
 '标准',
 '(',
 '这些',
 '标准',
 '被',
 '视为',
 '对',
 '竞争力',
 '很',
 '重要',
 ')',
 '，',
 '相比',
 '之前',
 '2003',
 '年',
 '的',
 '研究',
 '，',
 '伦敦',
 '和',
 '纽约',
 '相比',
 '法兰克福',
 '和',
 '巴黎',
 '的',
 '领先',
 '优势',
 '更大',
 '了',
 '。',
 ' ',
 ' ',
 '今年',
 '，',
 '对',
 '那些',
 '需要'

## 2c. Stop-words Removing 

In [30]:
# see the stopwords through a file of previous-generated stopwords
f_stop = open('StopwordsChinese.txt', encoding = 'utf-8')
text_stop = f_stop.read()
stopwords = text_stop.split("\n")
stopwords

['打开天窗说亮话',
 '到目前为止',
 '赶早不赶晚',
 '常言说得好',
 '何乐而不为',
 '毫无保留地',
 '由此可见',
 '这就是说',
 '这么点儿',
 '综上所述',
 '总的来看',
 '总的来说',
 '总的说来',
 '总而言之',
 '相对而言',
 '除此之外',
 '反过来说',
 '恰恰相反',
 '如上所述',
 '换句话说',
 '具体地说',
 '具体说来',
 '另一方面',
 '与此同时',
 '一则通过',
 '毫无例外',
 '不然的话',
 '从此以后',
 '从古到今',
 '从古至今',
 '从今以后',
 '大张旗鼓',
 '从无到有',
 '从早到晚',
 '弹指之间',
 '不亦乐乎',
 '不知不觉',
 '不止一次',
 '不择手段',
 '不可开交',
 '不可抗拒',
 '不仅仅是',
 '不管怎样',
 '挨家挨户',
 '长此下去',
 '长话短说',
 '除此而外',
 '除此以外',
 '除此之外',
 '得天独厚',
 '川流不息',
 '长期以来',
 '挨门挨户',
 '挨门逐户',
 '多多少少',
 '多多益善',
 '二话不说',
 '更进一步',
 '二话没说',
 '分期分批',
 '风雨无阻',
 '归根到底',
 '归根结底',
 '反之亦然',
 '大面儿上',
 '倒不如说',
 '成年累月',
 '换句话说',
 '或多或少',
 '简而言之',
 '接连不断',
 '尽如人意',
 '尽心竭力',
 '尽心尽力',
 '尽管如此',
 '据我所知',
 '具体地说',
 '具体来说',
 '具体说来',
 '近几年来',
 '每时每刻',
 '屡次三番',
 '三番两次',
 '三番五次',
 '三天两头',
 '另一方面',
 '老老实实',
 '年复一年',
 '恰恰相反',
 '顷刻之间',
 '穷年累月',
 '千万千万',
 '日复一日',
 '如此等等',
 '如前所述',
 '如上所述',
 '一方面',
 '切不可',
 '顷刻间',
 '全身心',
 '另方面',
 '另一个',
 '猛然间',
 '默默地',
 '就是说',
 '近年来',
 '尽可能',
 '接下来',
 '简言之',
 '急匆匆',
 '即是说',
 '基本上',


In [31]:
tokens_stop_remove = [word for word in words if not word in stopwords]

# print the tokens after removing the stop words
tokens_stop_remove

['伦敦',
 '商业区',
 '管理机构',
 '表示',
 '，',
 '伦敦',
 '卓越',
 '金融中心',
 '，',
 '决不能',
 '自满',
 '，',
 '伦敦',
 '拥有',
 '熟练',
 '员工',
 '，',
 '灵活',
 '劳动力',
 '市场',
 '，',
 '比较',
 '宽松',
 '监管',
 '，',
 '伦敦',
 '金融城',
 '成为',
 '目前',
 '全球',
 '机构',
 '经营',
 '最佳',
 '地点',
 '。',
 ' ',
 ' ',
 '昨天',
 '，',
 '伦敦',
 '金融城',
 '当局',
 '(',
 'Corporation',
 ' ',
 'of',
 ' ',
 'London',
 ')',
 '公布',
 '对',
 '各',
 '金融中心',
 '排名',
 '最新',
 '研究',
 '，',
 '该',
 '研究',
 '表示',
 '，',
 '中国',
 '经济',
 '力量',
 '增强',
 '，',
 '未来',
 '数年',
 '，',
 '出现',
 '新',
 '全球',
 '金融服务',
 '中心',
 '。',
 ' ',
 ' ',
 '400',
 '名',
 '业内人士',
 '指定',
 '评估',
 '标准',
 '(',
 '标准',
 '被',
 '视为',
 '对',
 '竞争力',
 '重要',
 ')',
 '，',
 '相比',
 '之前',
 '2003',
 '年',
 '研究',
 '，',
 '伦敦',
 '和',
 '纽约',
 '相比',
 '法兰克福',
 '和',
 '巴黎',
 '领先',
 '优势',
 '更大',
 '。',
 ' ',
 ' ',
 '今年',
 '，',
 '对',
 '需要',
 '全球',
 '金融中心',
 '交易',
 '，',
 '伦敦',
 '成为',
 '首选',
 '地',
 '可能性',
 '稍大',
 '，',
 '而',
 '之前',
 '调查',
 '纽约',
 '位居',
 '榜首',
 '。',
 '这次',
 '报告',
 '说',
 '，',
 '两',
 '城市',
 '将',
 '继续',
 '“',
 '并驾齐驱',
 '”',
 '。'

## 2d. Lemmatization and Stemming

This part is ignored as Chinese words are hard to do actual lemmatization because of the particularity of the language. 

## 2e. Noun Phrase Extraction

In [45]:
import jieba.posseg as pseg
words_type = pseg.lcut(text)
words_type

[pair('伦敦', 'ns'),
 pair('商业区', 'n'),
 pair('的', 'uj'),
 pair('管理机构', 'n'),
 pair('表示', 'v'),
 pair('，', 'x'),
 pair('尽管', 'c'),
 pair('伦敦', 'ns'),
 pair('一直', 'd'),
 pair('都', 'd'),
 pair('是', 'v'),
 pair('卓越', 'nr'),
 pair('的', 'uj'),
 pair('金融中心', 'n'),
 pair('，', 'x'),
 pair('但', 'c'),
 pair('它', 'r'),
 pair('决不能', 'v'),
 pair('自满', 'a'),
 pair('，', 'x'),
 pair('不过', 'c'),
 pair('伦敦', 'ns'),
 pair('拥有', 'v'),
 pair('熟练', 'a'),
 pair('的', 'uj'),
 pair('员工', 'n'),
 pair('，', 'x'),
 pair('灵活', 'a'),
 pair('的', 'uj'),
 pair('劳动力', 'n'),
 pair('市场', 'n'),
 pair('，', 'x'),
 pair('以及', 'c'),
 pair('比较', 'd'),
 pair('宽松', 'a'),
 pair('的', 'uj'),
 pair('监管', 'vn'),
 pair('，', 'x'),
 pair('伦敦', 'ns'),
 pair('金融城', 'nr'),
 pair('因此', 'c'),
 pair('成为', 'v'),
 pair('目前', 't'),
 pair('全球', 'n'),
 pair('机构', 'n'),
 pair('经营', 'vn'),
 pair('的', 'uj'),
 pair('最佳', 'z'),
 pair('地点', 'n'),
 pair('。', 'x'),
 pair(' ', 'x'),
 pair(' ', 'x'),
 pair('昨天', 't'),
 pair('，', 'x'),
 pair('伦敦', 'ns'),
 pair('

## 2f. Named Entity Extraction

In [60]:
# create an empty list to store the named entities
entity2 = []
for i in pseg.cut(text):
    if tuple(i)[1] == ('ns' or 'nz' or 'nrt' or 'nr'):
        entity2.append(tuple(i)[0])
entity2 = list(set(entity2))
entity2

['亚洲', '中国', '伦敦', '东京', '城市', '巴黎', '纽约', '上海', '华尔街']

## 2g. Word Embedding

TF-IDF is used. 

In [62]:
#use tf-idf to get the 5 most representative words for each test file 
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer

# append text to corpus
corpus = []
corpus.append(text)

#initialize of the tf-idf transformer
transformer = TfidfTransformer()
#initialize of the vectorizer of words
vectorizer = CountVectorizer()
#vectorize the corpus
vector = vectorizer.fit_transform(corpus)
#calculate the tf-idf for all words
tfidf = transformer.fit_transform(vector)

## 2h. Overall Preprocessing

In [65]:
#overall preprocessing process(together with the word embedding part in the next cell)

#import of the needed packages
#if not installed, pip install them
import os
from string import punctuation
import re
import nltk
#nltk.download()
from nltk.corpus import stopwords
import spacy

#path where we store the txt files
#change the path to your own path where you store your own test files
path = r"DatasetChinese/"
#define the files as all files under the certain path
files = os.listdir(path)

# make a work directory
try:
    os.mkdir("Work/")
except:
    pass

#go through the files in files
for file in files:
    #print out the name of the file that is being processed now
    print(file)
    #check if the file is a txt file
    if file.endswith(".txt"):
        #if txt file satisfied, open it and read it into the text space
        text = open(path + file, errors = 'ignore', encoding = 'utf-8').read() + '\n\n'
        #preprocessing: remove the punctuations
        puncs = punctuation + u'.,;《》？！“”‘’@#￥%…&×（）——+【】{};；●，。&～、|\s:：'
        text = re.sub(r"[{}]+".format(puncs)," ",text)

        #preprocessing: remove the numbers 
        text = re.sub(r"[0-9]+", "", text)

        #preprocessing: word segmentation
        #we use jieba library and the lcut function to get the word tokens
        tokens = jieba.lcut(text)
        
        #make cuts to get the part of speech for each word
        cuts = pseg.cut(text)   
        #get the list of stopwords
        #the stopwords txt file downloaded from github is included in our file submission
        #the following path is where you stop the stopwords txt file
        path2 = r'StopwordsChinese.txt'
        #get the list of stopwords
        stopwords = [line.strip() for line in open(path2, encoding="utf-8").readlines()]
        #create a list to store the words
        words = []
        for word, flag in cuts:
            #print(flag)
            #if the word is not proper noun and not a stopword, insert it into the words list
            if flag != ('ns' or 'nz' or 'nr' or 'nrt'):
                if word not in stopwords:
                    words.append(word)
        #get the final text by inserting the words in the word list in our final text
        text_final = ""
        for i in words:
            text_final += i
            text_final += " "
        #open a new file and write the words into it
        #change the path to your own path where you want to store the words.txt file 
        with open ("Work/wordsChinese.txt", 'a', encoding = 'utf-8') as f:
            #a single line of words for each file 
            text_write = text_final + '\n'
            #write the words into the file
            f.write(text_write)

1001571.txt
1005614.txt
1005617.txt
1005672.txt
1005673.txt
1005674.txt
1005676.txt
1005677.txt
1005678.txt
1005679.txt
1005680.txt
1005681.txt
1005703.txt
1005704.txt
1005706.txt
1005707.txt
1005708.txt
1005710.txt
1005713.txt
1005714.txt
1005716.txt
1005736.txt
1005737.txt
1005738.txt
1005739.txt
1005740.txt
1005741.txt
1005742.txt
1005743.txt
1005744.txt
1005746.txt
1005748.txt
1005749.txt
1005756.txt
1005764.txt
1005765.txt
1005766.txt
1005767.txt
1005768.txt
1005769.txt
1005770.txt
1005771.txt
1005772.txt
1005773.txt
1005775.txt
1005776.txt
1005779.txt
1005780.txt
1005782.txt
1005793.txt
1005794.txt
1005795.txt
1005796.txt
1005797.txt
1005798.txt
1005799.txt
1005802.txt
1005803.txt
1005805.txt
1005809.txt
1005810.txt
1005813.txt
1005815.txt
1005816.txt
1005817.txt
1005818.txt
1005824.txt
1005825.txt
1005826.txt
1005827.txt
1005828.txt
1005830.txt
1005831.txt
1005835.txt
1005836.txt
1005837.txt
1005838.txt
1005839.txt
1005845.txt
1005846.txt
1005849.txt
1005850.txt
1005851.txt
1005