# 개발용 10문제에 대해서 답안을 제시함

# 패키지 로드

In [1]:
from itertools import islice # for print head
import re # 정규표현식 모듈
from pandas import DataFrame as df # dataframe 을 활용하기 위함
import pandas as pd
from operator import eq
import math
import numpy as np

# 3. TF-IDF of queries

각 키워드별 TF-IDF 를 구함

In [2]:
def get_queries_tf(queries):
    # 검색어 소문자로 변경
    for i in range(0, len(queries)):
        queries[i] = queries[i].lower()

    # 검색쿼리 TF(word count)
    queries = df(data = {'token':queries})
    queries_tf = queries.groupby(['token'], as_index=False).size().reset_index().rename(columns={0:'count'})

    # 검색쿼리 정규화
    sum_tf = queries_tf["count"].sum()
    queries_tf["tf_norm"] = queries_tf["count"]/sum_tf

    return queries_tf

In [3]:
def loadIdf():
    # idf 는 문서를 대상으로 한 idf 값을 사용함
    chapterspath = "data/IDF.xlsx"
    idf = pd.read_excel(chapterspath)
    return idf

In [4]:
def get_quries_tfidf(queries):
    # TF 계산
    queries_tf = get_queries_tf(queries)
    
    #idf 로드
    idf = loadIdf()
    
    queries_tf_idf = pd.merge(queries_tf, idf, how='left', on=['token', 'token'])
    queries_tf_idf["tf_idf"] = queries_tf_idf["tf_norm"]*queries_tf_idf["idf"]
    
    return queries_tf_idf

In [5]:
# 검색어의 TF
# 문제 1번
q1 = ["Darcy", "Darcy", "poetry", "food", "love"]
q1_tfidf = get_quries_tfidf(q1)
q1_tfidf

Unnamed: 0,token,count,tf_norm,numberOfChap,idf,tf_idf
0,darcy,2,0.4,9,1.200671,0.480268
1,food,1,0.2,1,3.397895,0.679579
2,love,1,0.2,7,1.451985,0.290397
3,poetry,1,0.2,1,3.397895,0.679579


# 4. Vector Space Model ( Cosine Similarity )

## 4.1 Modeling (함수 구현)

In [6]:
# load tf_idf
def load_tf_idf_doc():
    # 문서의 tf_idf 로드
    tf_idf_docpath = "data/tf_idf.xlsx"
    tf_idf_doc = pd.read_excel(tf_idf_docpath)
    tf_idf_doc = tf_idf_doc.rename(columns={'tf_idf':'tf_idf_doc'}) #join 시에 이름이 겹치는 것을 방지
    
    return tf_idf_doc

In [7]:
# param: 쿼리의 tf_idf, 문서의 tf_idf
def get_dot(q1_tfidf, tf_idf_doc):
    
    # 주어진 쿼리의 tf idf 와 tf_idf_doc 사이의 cos similairity 계산
    # 문서에 있는 단어의 tf_idf 를 가져옴(left join)
    megedtfidf = pd.merge(q1_tfidf, tf_idf_doc, how='left', on=['token', 'token'])

    # Dot product(Query, Document) 
    # 문서와 쿼리의 단어들의 tf*idf 값을 곱하여 dot 을 생성해냄
    megedtfidf["dot"] = megedtfidf["tf_idf"]*megedtfidf["tf_idf_doc"]

    # 필요한 정보만 따로 빼낸다
    chapters = megedtfidf["chapter"]
    token = megedtfidf["token"]
    dot = megedtfidf["dot"]
    term_dotproducts = df(data = {'chapter':chapters, 'token':token, 'dot':dot})
    term_dotproducts

    # 문서별로 각 모든 단어의 dot 값들을 합산함 
    # Dot project 합산
    dot_sums = term_dotproducts.groupby(['chapter'], as_index=False)["dot"].sum()
    
    return dot_sums

In [8]:
# ||Query||
def get_abs_query(q1_tfidf):
    query_abs = np.sqrt(np.sum(q1_tfidf["tf_idf"]*q1_tfidf["tf_idf"]))
    return query_abs

In [9]:
# ||Document n||
# 각 문서의 tfidf 값들을 모두 제곱한 후에, 합산하여준다.
# param: 쿼리의 tf_idf, 문서의 tf_idf
def get_abs_documents(q1_tfidf, tf_idf_doc):
    
    # 문서의 tf_idf 를 제곱해준다.
    megedtfidf = pd.merge(q1_tfidf, tf_idf_doc, how='left', on=['token', 'token'])
    megedtfidf["doc_abs"] = megedtfidf["tf_idf_doc"]*megedtfidf["tf_idf_doc"]
    
    # 챕터별로 제곱해준 문서의 tf_idf 값을 합해준다.
    summation = megedtfidf.groupby(['chapter'], as_index=False)["doc_abs"].sum()

    # tfidf 합산 후 sqrt를 취함.
    # chapter 정보를 유지하기위해 조금 코드가 복잡해짐
    chapters = summation["chapter"]
    absolutes = np.sqrt(summation["doc_abs"])
    doc_abs = df(data = {'chapter':chapters, 'doc_abs':absolutes})
    
    return df(doc_abs)

## 4.2 Sending Queries (질의 날리기)

In [10]:
def get_related_chapaters(keywords): 
    tf_idf_doc = load_tf_idf_doc() # 문서의 각단어의 tfidf 획득
    q1_tfidf = get_quries_tfidf(keywords) # 질의문의 tfidf 값을 계산

    # 질의문 1과 각 문서사이의 모든 cos similarity 구함
    # dot_sums / ||Query|| * ||Document n|| 수행
    dots = get_dot(q1_tfidf, tf_idf_doc)
    doc_abs = get_abs_documents(q1_tfidf, tf_idf_doc)
    query_abs = get_abs_query(q1_tfidf)
    vectorspace = pd.merge(dots, doc_abs, how='left', on=['chapter', 'chapter'])
    vectorspace["query_abs"] = query_abs
    vectorspace["cos_similarity"] = vectorspace["dot"] / ( vectorspace["doc_abs"] * vectorspace["query_abs"])
    vectorspace = vectorspace.sort_values(by=['cos_similarity'], ascending=False)
    
    return vectorspace

In [11]:
q={}
v={}
a={}

### 1.	In which chapter did Mr. Darcy think poetry is “food of love”?
Answer: Chapter 9 <br />
q1 = [Darcy, poetry, food, love]

In [12]:
q["result"+str(1)] = ["Darcy", "poetry", "food", "love"]
v["result"+str(1)] = get_related_chapaters(q["result"+str(1)])
a["result"+str(1)] = [9]
v["result"+str(1)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
6,9,0.016874,0.016755,1.290379,0.780471
5,7,0.000962,0.002041,1.290379,0.36503
4,6,0.007308,0.017422,1.290379,0.325092
8,13,0.000708,0.001952,1.290379,0.28131
0,2,0.001523,0.004196,1.290379,0.28131
1,3,0.003812,0.010661,1.290379,0.277114
7,11,0.007015,0.021098,1.290379,0.257676
2,4,0.004601,0.015328,1.290379,0.23262
3,5,0.00243,0.008094,1.290379,0.23262
9,15,0.001327,0.00442,1.290379,0.23262


### 2.	What chapter the militia regiment has arrived in Meryton for the winter, so Catherine and Lydia particularly liked it?
Answer: Chapter 7 <br />
q2 = [militia, regiment, arrived, Meryton, winter, Catherine, Lydia]

In [13]:
q["result"+str(2)] = ["militia", "regiment", "arrived", "meryton", "winter", "catherine", "lydia"]
v["result"+str(2)] = get_related_chapaters(q["result"+str(2)])
a["result"+str(2)] = [7]
v["result"+str(2)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
5,7,0.011088,0.016876,1.003746,0.654584
1,3,0.002254,0.004441,1.003746,0.505651
6,9,0.002409,0.005223,1.003746,0.459412
8,15,0.002969,0.00778,1.003746,0.380215
9,17,0.003049,0.008105,1.003746,0.374819
7,13,0.00388,0.012637,1.003746,0.305923
3,5,0.00203,0.009789,1.003746,0.206652
0,2,0.001741,0.008393,1.003746,0.206652
2,4,0.000641,0.003089,1.003746,0.206652
4,6,0.00086,0.004145,1.003746,0.206652


### 3.	Which chapter has Mr. Collins’s letter to Mr. Bennet?
Answer: Chapter 13 <br />
q3 = [Mr, Collins, letter, Bennet]


In [14]:
q["result"+str(3)] = ["mr", "collins", "letter", "bennet"]
v["result"+str(3)] = get_related_chapaters(q["result"+str(3)])
a["result"+str(3)] = [13]
v["result"+str(3)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
8,13,0.043096,0.041009,1.084908,0.968641
9,15,0.025025,0.041938,1.084908,0.550009
10,17,0.0163,0.037239,1.084908,0.403471
6,9,0.007417,0.02105,1.084908,0.324758
5,7,0.00623,0.017769,1.084908,0.323146
2,4,0.003723,0.010638,1.084908,0.322608
0,2,0.015173,0.044117,1.084908,0.317021
3,5,0.00618,0.018117,1.084908,0.314401
1,3,0.010937,0.033001,1.084908,0.305491
7,11,0.00572,0.018306,1.084908,0.288012


### 4.	What chapter did Mr. Bingley dance with Jane Bennet twice in the party?
Answer: Chapter 3 <br />
q4= [Mr, Bingley, dance, Jane, Bennet, twice, party]


In [15]:
q["result"+str(4)] = ["mr", "bingley", "dance", "jane", "bennet", "twice", "party"]
v["result"+str(4)] = get_related_chapaters(q["result"+str(4)])
a["result"+str(4)] = [3]
v["result"+str(4)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
1,3,0.019196,0.043126,0.585927,0.759668
3,5,0.00871,0.021618,0.585927,0.687623
4,6,0.006558,0.019404,0.585927,0.576814
7,11,0.007288,0.022565,0.585927,0.551205
5,7,0.006686,0.023317,0.585927,0.489375
6,9,0.007981,0.028435,0.585927,0.479043
8,13,0.005683,0.021252,0.585927,0.456408
0,2,0.012431,0.048715,0.585927,0.435514
2,4,0.006477,0.025998,0.585927,0.425166
10,17,0.009473,0.03812,0.585927,0.424145


### 5.	Find the chapter that the ladies met Mr. Wickham at the first time in Meryton?
Answer: Chapter 15<br />
q5 = [ladies, met, Mr, Wickham, first, Meryton]


In [16]:
q["result"+str(5)] = ["ladies", "met", "mr", "wickham", "first", "meryton"]
v["result"+str(5)] = get_related_chapaters(q["result"+str(5)])
a["result"+str(5)] = [15]
v["result"+str(5)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
10,17,0.022846,0.047863,0.654111,0.729716
9,15,0.015168,0.036438,0.654111,0.636407
2,4,0.004357,0.010991,0.654111,0.606037
6,9,0.005696,0.017652,0.654111,0.493285
5,7,0.005879,0.018268,0.654111,0.49198
4,6,0.004961,0.015527,0.654111,0.488458
3,5,0.00544,0.01869,0.654111,0.444966
1,3,0.007537,0.031289,0.654111,0.368256
7,11,0.004262,0.018522,0.654111,0.351749
0,2,0.007418,0.037838,0.654111,0.299707


### 6.	In which chapter Miss Bingley persuade Elizabeth to take a turn about the room?
Answer: Chapter 11 <br />
q6 = [Miss, Bingley, persuade, Eizabeth, turn, room]


In [17]:
q["result"+str(6)] = ["miss", "bingley", "persuade", "eizabeth", "turn", "room"]
v["result"+str(6)] = get_related_chapaters(q["result"+str(6)])
a["result"+str(6)] = [11]
v["result"+str(6)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
7,11.0,0.010487,0.026714,0.650756,0.603272
8,13.0,0.002485,0.006745,0.650756,0.566111
0,2.0,0.00759,0.021965,0.650756,0.531016
4,6.0,0.005823,0.01778,0.650756,0.503238
9,15.0,0.002382,0.007547,0.650756,0.485079
6,9.0,0.005797,0.019223,0.650756,0.463388
1,3.0,0.007029,0.023495,0.650756,0.459741
5,7.0,0.003867,0.013137,0.650756,0.452344
2,4.0,0.007479,0.027148,0.650756,0.423347
10,17.0,0.001645,0.006448,0.650756,0.39211


### 7.	Which chapter said Charlotte Lucas danced with Bingley for the first time in the party?
Answer: Chapter 5 (Chapter 3) <br />
q7 = [Charlotte, Lucas, danced, Bingley, first, party]


In [18]:
q["result"+str(7)] = ["charlotte", "lucas", "danced", "bingley", "first", "party"]
v["result"+str(7)] = get_related_chapaters(q["result"+str(7)])
a["result"+str(7)] = [5, 3]
v["result"+str(7)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
4,6,0.008129,0.013886,0.705733,0.829536
1,3,0.017332,0.031124,0.705733,0.789053
3,5,0.016201,0.031482,0.705733,0.729186
8,13,0.003012,0.005927,0.705733,0.720009
6,9,0.007197,0.019545,0.705733,0.521799
7,11,0.005023,0.014243,0.705733,0.499777
9,15,0.001513,0.004491,0.705733,0.477326
10,17,0.00151,0.006233,0.705733,0.343268
2,4,0.004752,0.023864,0.705733,0.282134
0,2,0.00395,0.020477,0.705733,0.27331


### 8.	What chapter did the ladies go to Meryton with Mr. Collins to meet the relative Mr. Philips?
Answer: Chapter 15<br />
q8 = [ladies, Meryton, Mr, Collins, meet, relative, Phillips]


In [19]:
q["result"+str(8)] = ["ladies", "Meryton", "Mr", "Collins", "meet", "relative", "Phillips"]
v["result"+str(8)] = get_related_chapaters(q["result"+str(8)])
a["result"+str(8)] = [15]
v["result"+str(8)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
9,15.0,0.018824,0.042386,0.653981,0.679073
5,7.0,0.00825,0.020238,0.653981,0.623331
8,13.0,0.008032,0.023511,0.653981,0.522418
3,5.0,0.005577,0.019071,0.653981,0.447137
10,17.0,0.009916,0.037348,0.653981,0.405959
4,6.0,0.003612,0.015021,0.653981,0.367666
2,4.0,0.002221,0.009349,0.653981,0.3633
0,2.0,0.007534,0.038151,0.653981,0.301945
6,9.0,0.003092,0.016394,0.653981,0.288352
1,3.0,0.005571,0.031104,0.653981,0.273877


### 9.	Find the chapter that Mr. Collins request to dance with his cousins in upcoming ball. 
Answer: Chapter 17	<br />
q9 = [Mr, Collins, request, dance, cousins, ball]


In [20]:
q["result"+str(9)] = ["Mr", "Collins", "request", "dance", "cousins", "ball"]
v["result"+str(9)] = get_related_chapaters(q["result"+str(9)])
a["result"+str(9)] = [17]
v["result"+str(9)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
8,13,0.011202,0.024131,0.801138,0.579471
9,15,0.017624,0.041005,0.801138,0.536499
10,17,0.017682,0.041648,0.801138,0.52993
4,6,0.006213,0.016931,0.801138,0.458077
3,5,0.00578,0.018238,0.801138,0.395605
1,3,0.010271,0.03382,0.801138,0.379072
7,11,0.005281,0.019263,0.801138,0.342188
6,9,0.004416,0.0176,0.801138,0.313158
0,2,0.009309,0.038726,0.801138,0.300036
2,4,0.002166,0.009054,0.801138,0.298621


### 10.	Find the chapter in which Darcy said his pleasure mind about Elizabeth to Miss Bingley.
Answer: Chapter 6<br />
q10 = [Darcy, pleasure, Elizabeth, Miss, Bingley]


In [21]:
q["result"+str(10)] = ["Darcy", "pleasure", "Elizabeth", "Miss", "Bingley"]
v["result"+str(10)] = get_related_chapaters(q["result"+str(10)])
a["result"+str(10)] = [6]
v["result"+str(10)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
4,6,0.012478,0.026422,0.502984,0.938927
7,11,0.016637,0.036611,0.502984,0.903459
9,15,0.00444,0.010269,0.502984,0.859549
6,9,0.010428,0.024403,0.502984,0.849607
2,4,0.013319,0.031514,0.502984,0.840257
10,17,0.006594,0.016196,0.502984,0.8094
1,3,0.009435,0.023356,0.502984,0.803157
5,7,0.007674,0.019982,0.502984,0.763561
3,5,0.008026,0.020967,0.502984,0.761035
8,13,0.002119,0.005823,0.502984,0.723583


### 11.	In which chapter did Mrs. Bennet visit to Netherfield for Jane?
Answe: Chapter 9<br />
q11 = [Mrs, Bennet, visit, Netherfield, Jane]


In [22]:
q["result"+str(11)] = ["Mrs", "Bennet", "visit", "Netherfield", "Jane"]
v["result"+str(11)] = get_related_chapaters(q["result"+str(11)])
a["result"+str(11)] = [9]
v["result"+str(11)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
6,9,0.009781,0.021106,0.537438,0.862312
1,3,0.008036,0.018951,0.537438,0.789043
5,7,0.008039,0.019067,0.537438,0.784525
0,2,0.016126,0.038505,0.537438,0.779276
3,5,0.008369,0.020077,0.537438,0.775601
2,4,0.003865,0.009507,0.537438,0.75645
10,17,0.006028,0.014937,0.537438,0.750943
4,6,0.002934,0.007835,0.537438,0.696831
9,15,0.005998,0.016298,0.537438,0.684721
7,11,0.002057,0.005965,0.537438,0.64163


## Result
결과출력

In [23]:
for i in range(1, 12):
    # 문제번호
    v["result"+str(i)]["question_no"] = i
    
    # 순위매기기(index 재지정)
    v["result"+str(i)] = v["result"+str(i)].reset_index()

    # 기존의 index 열은 삭제
    v["result"+str(i)] = v["result"+str(i)].drop('index', 1)
    
    # 정답챕터
    v["result"+str(i)]["reference_chap"] = a["result"+str(i)][0]
    
    # 열의 순서를 바꿈
    v["result"+str(i)] = pd.DataFrame(v["result"+str(i)], 
                                      columns=['question_no','reference_chap', 'chapter','dot', 'doc_abs', 'query_abs', 'cos_similarity'])

v["result"+str(1)]

Unnamed: 0,question_no,reference_chap,chapter,dot,doc_abs,query_abs,cos_similarity
0,1,9,9,0.016874,0.016755,1.290379,0.780471
1,1,9,7,0.000962,0.002041,1.290379,0.36503
2,1,9,6,0.007308,0.017422,1.290379,0.325092
3,1,9,13,0.000708,0.001952,1.290379,0.28131
4,1,9,2,0.001523,0.004196,1.290379,0.28131
5,1,9,3,0.003812,0.010661,1.290379,0.277114
6,1,9,11,0.007015,0.021098,1.290379,0.257676
7,1,9,4,0.004601,0.015328,1.290379,0.23262
8,1,9,5,0.00243,0.008094,1.290379,0.23262
9,1,9,15,0.001327,0.00442,1.290379,0.23262


In [24]:
result = v["result"+str(1)]
for i in range(2, 12):
    result = result.append(v["result"+str(i)])

result

Unnamed: 0,question_no,reference_chap,chapter,dot,doc_abs,query_abs,cos_similarity
0,1,9,9.0,0.016874,0.016755,1.290379,0.780471
1,1,9,7.0,0.000962,0.002041,1.290379,0.365030
2,1,9,6.0,0.007308,0.017422,1.290379,0.325092
3,1,9,13.0,0.000708,0.001952,1.290379,0.281310
4,1,9,2.0,0.001523,0.004196,1.290379,0.281310
5,1,9,3.0,0.003812,0.010661,1.290379,0.277114
6,1,9,11.0,0.007015,0.021098,1.290379,0.257676
7,1,9,4.0,0.004601,0.015328,1.290379,0.232620
8,1,9,5.0,0.002430,0.008094,1.290379,0.232620
9,1,9,15.0,0.001327,0.004420,1.290379,0.232620


In [25]:
import pandas as pd
writer = pd.ExcelWriter('data/results_train.xlsx')
result.to_excel(writer,'Sheet1')
writer.save()