# 개발 후 평가를 위한 문제 11문제에 대한 답을 제안함

# 패키지 로드

In [2]:
from itertools import islice # for print head
import re # 정규표현식 모듈
from pandas import DataFrame as df # dataframe 을 활용하기 위함
import pandas as pd
from operator import eq
import math
import numpy as np

# 3. TF-IDF of queries

각 키워드별 TF-IDF 를 구함

In [3]:
def get_queries_tf(queries):
    """Compute the term frequency of each distinct token of a search query.

    Tokens are lower-cased before counting, so "Darcy" and "darcy" are
    counted as the same term.

    Args:
        queries: Iterable of query tokens (strings). It is not modified.

    Returns:
        DataFrame with one row per distinct token, sorted by token, with the
        columns ``token``, ``count`` (number of occurrences) and ``tf_norm``
        (``count`` divided by the total number of tokens in the query).
    """
    # Lower-case into a new list so the caller's list is left untouched.
    lowered = [token.lower() for token in queries]

    # Raw term frequency (word count) per token. The count column is named
    # explicitly instead of renaming a version-dependent default column.
    queries_tf = df(data={'token': lowered}).groupby('token').size().reset_index(name='count')

    # Normalise by the total number of tokens in the query.
    queries_tf["tf_norm"] = queries_tf["count"] / queries_tf["count"].sum()

    return queries_tf

In [4]:
def loadIdf():
    """Read the IDF table from ``data/IDF.xlsx`` and return it as a DataFrame.

    Per the original author's note, the IDF values in that file were computed
    over the documents (not over the queries).
    """
    return pd.read_excel("data/IDF.xlsx")

In [5]:
def get_quries_tfidf(queries):
    """Compute the TF-IDF weight of every distinct token in a query.

    Args:
        queries: Query tokens (strings), passed on to ``get_queries_tf``.

    Returns:
        The term-frequency table from ``get_queries_tf`` left-joined with the
        IDF table from ``loadIdf`` on ``token``, plus a ``tf_idf`` column
        equal to ``tf_norm * idf``. Tokens missing from the IDF table keep
        their row but get NaN in the IDF columns and in ``tf_idf``.
    """
    # Term frequencies of the query
    queries_tf = get_queries_tf(queries)

    # Document-based IDF table
    idf = loadIdf()

    # Left join keeps every query token; a single join key is sufficient.
    queries_tf_idf = pd.merge(queries_tf, idf, how='left', on='token')
    queries_tf_idf["tf_idf"] = queries_tf_idf["tf_norm"] * queries_tf_idf["idf"]

    return queries_tf_idf

In [6]:
# Term frequency / TF-IDF of the search terms
# Question 1
q1 = ["Darcy", "Darcy", "poetry", "food", "love"]
q1_tfidf = get_quries_tfidf(q1)
# Bare expression: the notebook displays the resulting table.
q1_tfidf

Unnamed: 0,token,count,tf_norm,numberOfChap,idf,tf_idf
0,darcy,2,0.4,9,1.200671,0.480268
1,food,1,0.2,1,3.397895,0.679579
2,love,1,0.2,7,1.451985,0.290397
3,poetry,1,0.2,1,3.397895,0.679579


In [75]:
# Single-term query; the bare last expression displays its TF-IDF table.
q1 = ["Elizabeth"]
q1_tfidf = get_quries_tfidf(q1)
q1_tfidf

Unnamed: 0,token,count,tf_norm,numberOfChap,idf,tf_idf
0,elizabeth,1,1.0,11,1.0,1.0


# 4. Vector Space Model ( Cosine Similarity )

## 4.1 Modeling (함수 구현)

In [7]:
# load tf_idf
def load_tf_idf_doc():
    """Read the per-document TF-IDF table from ``data/tf_idf.xlsx``.

    The ``tf_idf`` column is renamed to ``tf_idf_doc`` so that it does not
    clash with the query-side ``tf_idf`` column when the tables are joined.
    """
    doc_table = pd.read_excel("data/tf_idf.xlsx")
    return doc_table.rename(columns={'tf_idf': 'tf_idf_doc'})

In [8]:
def get_dot(q1_tfidf, tf_idf_doc):
    """Sum, per chapter, the products of query and document TF-IDF weights.

    This is the numerator (dot product) of the cosine similarity between the
    query and each chapter.

    Args:
        q1_tfidf: Query table with at least the columns ``token`` and
            ``tf_idf``.
        tf_idf_doc: Document table with at least the columns ``token``,
            ``chapter`` and ``tf_idf_doc``.

    Returns:
        DataFrame with the columns ``chapter`` and ``dot``, one row per
        chapter that contains at least one of the query tokens.
    """
    # Left join: attach every document entry of each query token.
    merged = pd.merge(q1_tfidf, tf_idf_doc, how='left', on='token')

    # Per-term product of the query weight and the document weight.
    merged["dot"] = merged["tf_idf"] * merged["tf_idf_doc"]

    # Sum the per-term products chapter by chapter. Query tokens found in no
    # document have a NaN chapter and are dropped by the groupby.
    return merged.groupby(['chapter'], as_index=False)["dot"].sum()

In [9]:
def get_abs_query(q1_tfidf):
    """Return ||Query||, the Euclidean norm of the query's ``tf_idf`` column."""
    weights = q1_tfidf["tf_idf"]
    squared_total = np.sum(weights * weights)
    return np.sqrt(squared_total)

In [10]:
def get_abs_documents(q1_tfidf, tf_idf_doc, query_terms_only=True):
    """Compute ||Document n||, the Euclidean norm of each chapter's TF-IDF weights.

    Args:
        q1_tfidf: Query table with at least a ``token`` column.
        tf_idf_doc: Document table with at least the columns ``token``,
            ``chapter`` and ``tf_idf_doc``.
        query_terms_only: When True (the default, and the historical
            behaviour) only the weights of tokens that occur in the query
            contribute to a chapter's norm. With this restriction a query of
            a single term scores a cosine similarity of 1 for every chapter
            containing that term. When False the norm is taken over every
            token of the chapter, as in the textbook cosine similarity.

    Returns:
        DataFrame with the columns ``chapter`` and ``doc_abs``, one row per
        chapter.
    """
    if query_terms_only:
        # Keep only the document entries of tokens that occur in the query.
        weights = pd.merge(q1_tfidf, tf_idf_doc, how='left', on='token')
    else:
        weights = tf_idf_doc

    # Square the document weights; assign() returns a new frame, so the
    # caller's tables are never modified.
    weights = weights.assign(doc_abs=weights["tf_idf_doc"] * weights["tf_idf_doc"])

    # Sum the squares per chapter, then take the square root of each sum.
    summation = weights.groupby(['chapter'], as_index=False)["doc_abs"].sum()
    summation["doc_abs"] = np.sqrt(summation["doc_abs"])

    return summation

## 4.2 Sending Queries (질의 날리기)

In [11]:
def get_related_chapaters(keywords):
    """Rank chapters by cosine similarity to the given query keywords.

    Args:
        keywords: Query tokens (strings).

    Returns:
        DataFrame with the columns ``chapter``, ``dot``, ``doc_abs``,
        ``query_abs`` and ``cos_similarity``, sorted by ``cos_similarity`` in
        descending order. The original (pre-sort) row index is kept.
    """
    tf_idf_doc = load_tf_idf_doc()         # TF-IDF of every token of every document
    q1_tfidf = get_quries_tfidf(keywords)  # TF-IDF of the query tokens

    # cos_similarity = dot / (||Query|| * ||Document n||), per chapter.
    dots = get_dot(q1_tfidf, tf_idf_doc)
    doc_abs = get_abs_documents(q1_tfidf, tf_idf_doc)

    # One join key is sufficient to line up the dot products with the norms.
    vectorspace = pd.merge(dots, doc_abs, how='left', on='chapter')
    vectorspace["query_abs"] = get_abs_query(q1_tfidf)
    vectorspace["cos_similarity"] = vectorspace["dot"] / (vectorspace["doc_abs"] * vectorspace["query_abs"])

    return vectorspace.sort_values(by=['cos_similarity'], ascending=False)

In [12]:
# Per-question stores keyed "result<N>":
#   q - query keyword lists, v - ranking tables,
#   a - read by the results cell as the reference answers
#       (not filled in by any cell visible in this part of the notebook).
q, v, a = {}, {}, {}

## NDCG 방식으로 채점
### 1.	Where does the Bennet family live? (7)

In [115]:
# "family" is left out of the keywords.
# The text may say that the daughters live there, or speak of "the Bennets", etc.; the wording varies too much.
q["result"+str(1)] = ["bennet", "lived", "village"]
v["result"+str(1)] = get_related_chapaters(q["result"+str(1)])
v["result"+str(1)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
3,5,0.008476,0.010851,1.317882,0.592732
1,3,0.01068,0.014558,1.317882,0.556664
5,7,0.006253,0.011224,1.317882,0.422775
0,2,0.007707,0.023121,1.317882,0.252931
4,6,0.000634,0.001903,1.317882,0.252931
6,9,0.004532,0.013597,1.317882,0.252931
7,11,0.001795,0.005384,1.317882,0.252931
8,13,0.003584,0.010753,1.317882,0.252931
9,15,0.003272,0.009816,1.317882,0.252931
10,17,0.001119,0.003356,1.317882,0.252931


## MAP 방식으로 채점

### 2.	Who is considered to be the most beautiful among Mr. Bennet’s daughters? (5)

In [123]:
# "pretty" is chosen as a word for beauty that is typically applied to women.
# The question asks for the most beautiful person, so "prettiest" is chosen as a keyword as well.
q["result"+str(2)] = ["most", "beautiful", "pretty", "prettiest"]
v["result"+str(2)] = get_related_chapaters(q["result"+str(2)])
v["result"+str(2)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
2,5.0,0.008283,0.008629,1.118899,0.857947
0,3.0,0.005303,0.007282,1.118899,0.650852
1,4.0,0.006215,0.009047,1.118899,0.613937
3,6.0,0.002779,0.004046,1.118899,0.613937
4,9.0,0.001977,0.004421,1.118899,0.399602


### 3.	Whom did Mr. Bingley dance with in the party? (5)

In [126]:
q["result"+str(3)] = ["mr", "bingley", "danced", "dance", "party"]
# The question is in the past tense, so the past-tense verb is selected as a keyword too.
v["result"+str(3)] = get_related_chapaters(q["result"+str(3)])
v["result"+str(3)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
1,3,0.026689,0.044076,0.709748,0.853167
3,5,0.010243,0.020047,0.709748,0.719892
4,6,0.008208,0.018012,0.709748,0.642068
7,11,0.008803,0.021864,0.709748,0.567298
0,2,0.012779,0.042879,0.709748,0.419916
2,4,0.00728,0.025095,0.709748,0.408748
8,13,0.005161,0.018093,0.709748,0.4019
6,9,0.006675,0.023616,0.709748,0.398239
10,17,0.010176,0.036335,0.709748,0.394593
5,7,0.004334,0.015997,0.709748,0.381705


### 4.	What is the relationship between Elizabeth and Lydia? (5)

In [131]:
# "relation" appears only in chapter 7, so it must not be used (it steers the ranking in the wrong direction).
# "relationship" appears only in chapter 15, so it must not be used (it steers the ranking in the wrong direction).
#q["result"+str(4)] = ["relationship"]
q["result"+str(4)] = ["Elizabeth", "Lydia"]
v["result"+str(4)] = get_related_chapaters(q["result"+str(4)])
v["result"+str(4)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
8,13,0.004178,0.004739,0.881513,1.0
9,15,0.006948,0.008137,0.881513,0.96866
0,2,0.007538,0.008877,0.881513,0.963377
5,7,0.014436,0.017865,0.881513,0.916703
6,9,0.00755,0.01052,0.881513,0.81418
1,3,0.003818,0.005319,0.881513,0.81418
10,17,0.010249,0.01428,0.881513,0.81418
7,11,0.008075,0.016151,0.881513,0.567207
2,4,0.003191,0.006383,0.881513,0.567207
3,5,0.001124,0.002247,0.881513,0.567207


### 5.	How many daughters does Mr. Bennet have? (3)

In [138]:
# Keywords for question 5; the bare last expression displays the ranking.
q["result"+str(5)] = ["daughters", "Bennet"]
v["result"+str(5)] = get_related_chapaters(q["result"+str(5)])
v["result"+str(5)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
8,13,0.017512,0.018547,0.946001,0.998106
9,15,0.011238,0.012589,0.946001,0.943649
6,9,0.013176,0.015746,0.946001,0.884532
1,3,0.011712,0.015011,0.946001,0.82474
5,7,0.008212,0.011379,0.946001,0.762841
0,2,0.015289,0.023583,0.946001,0.685298
2,4,0.003191,0.006383,0.946001,0.528541
3,5,0.004494,0.008989,0.946001,0.528541
4,6,0.000951,0.001903,0.946001,0.528541
7,11,0.002692,0.005384,0.946001,0.528541


### 6.	In which chapter did the ladies go to Meryton? (3)

In [142]:
# Both "go" and "went" are selected.
q["result"+str(6)] = ["ladies", "go", "went", "Meryton"]
v["result"+str(6)] = get_related_chapaters(q["result"+str(6)])
v["result"+str(6)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
9,15.0,0.00398,0.00667,0.606611,0.983704
5,7.0,0.011261,0.019473,0.606611,0.953263
6,9.0,0.00219,0.003807,0.606611,0.948318
0,2.0,0.002731,0.005619,0.606611,0.801197
7,11.0,0.00214,0.004568,0.606611,0.772193
8,13.0,0.001673,0.003651,0.606611,0.755431
2,4.0,0.00176,0.00387,0.606611,0.749566
10,17.0,0.001891,0.00441,0.606611,0.706862
4,6.0,0.00179,0.004274,0.606611,0.690414
3,5.0,0.003553,0.009789,0.606611,0.5984


### 7.	Who inherits the estate of Mr. Bennet? (3)

In [62]:
# Keywords for question 7; the bare last expression displays the ranking.
q["result"+str(7)] = ["inherit", "estate", "mr", "bennet"]
v["result"+str(7)] = get_related_chapaters(q["result"+str(7)])
v["result"+str(7)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
2,4.0,0.007126,0.01308,0.57001,0.955778
8,13.0,0.011356,0.022658,0.57001,0.879238
5,7.0,0.007096,0.017875,0.57001,0.696454
1,3.0,0.011937,0.033077,0.57001,0.633132
6,9.0,0.007417,0.02105,0.57001,0.618116
0,2.0,0.015173,0.044117,0.57001,0.603391
9,15.0,0.011411,0.03345,0.57001,0.598455
3,5.0,0.00618,0.018117,0.57001,0.598403
7,11.0,0.00572,0.018306,0.57001,0.548177
4,6.0,0.004044,0.014398,0.57001,0.492707


### 8.	What is the last name of Mr. Bingley’s married sister? (3)

In [157]:
# Keywords for question 8; the bare last expression displays the ranking.
q["result"+str(8)] = ["Miss", "bingley"]
v["result"+str(8)] = get_related_chapaters(q["result"+str(8)])
v["result"+str(8)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
5,7,0.010039,0.012877,0.781283,0.997908
9,15,0.005378,0.006948,0.781283,0.990751
10,17,0.004936,0.006448,0.781283,0.979804
7,11,0.018965,0.025055,0.781283,0.968848
4,6,0.012932,0.017325,0.781283,0.955375
2,4,0.01937,0.026663,0.781283,0.929865
6,9,0.013108,0.018829,0.781283,0.891007
1,3,0.01388,0.020196,0.781283,0.879667
3,5,0.012462,0.01902,0.781283,0.838627
0,2,0.010116,0.020231,0.781283,0.639973


### 9.	What is the nickname of Elizabeth? (5)

In [112]:
# "Elizabeth" appears in every chapter, so on its own it is meaningless: any chapter that contains it gets a cos sim of 1.
# The word "nickname" does not occur in the text, so no cos sim can be computed for it.
# Assuming that the man courting Eliza in the story would call her by a pet name, "Darcy" was added.
q["result"+str(9)] = ["Elizabeth", "mr", "Darcy"]
v["result"+str(9)] = get_related_chapaters(q["result"+str(9)])
v["result"+str(9)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
7,11,0.019624,0.031754,0.618386,0.999363
4,6,0.014647,0.023854,0.618386,0.992945
6,9,0.013405,0.022293,0.618386,0.972354
2,4,0.011099,0.018658,0.618386,0.961981
3,5,0.009232,0.017833,0.618386,0.837178
5,7,0.010271,0.02074,0.618386,0.800886
1,3,0.015871,0.032177,0.618386,0.797639
10,17,0.018638,0.038186,0.618386,0.789283
9,15,0.014448,0.032786,0.618386,0.71262
8,13,0.00672,0.017679,0.618386,0.614734


### 10.	Who is living at Netherfield? (5)

In [104]:
# "Netherfield" appears in every chapter, so on its own it is meaningless: any chapter that contains it gets a cos sim of 1.
# "live", "living" and "settled" appear only in particular chapters; "settle" does not appear at all.
# A different keyword was needed; "came" was chosen in the sense of "came from ...".
q["result"+str(10)] = ["Netherfield", "came"]
v["result"+str(10)] = get_related_chapaters(q["result"+str(10)])
v["result"+str(10)]

Unnamed: 0,chapter,dot,doc_abs,query_abs,cos_similarity
4,7,0.006535,0.006518,1.002657,1.0
2,5,0.004518,0.004506,1.002657,1.0
7,15,0.00405,0.004208,1.002657,0.959876
8,17,0.007002,0.008497,1.002657,0.821858
0,3,0.005216,0.00633,1.002657,0.821858
5,9,0.006049,0.007682,1.002657,0.785405
1,4,0.003067,0.005109,1.002657,0.598745
3,6,0.000686,0.001142,1.002657,0.598745
6,11,0.00097,0.001616,1.002657,0.598745


## Result
결과출력

In [23]:
# Post-process the ranking table of each of the ten questions: re-rank the
# rows, tag them with the question number and the reference chapter, and put
# the columns into their final order.
for i in range(1, 11):
    result_key = "result" + str(i)

    # Ranking: replace the pre-sort index with 0..n-1. drop=True discards the
    # old index directly instead of materialising an 'index' column and
    # deleting it afterwards.
    ranked = v[result_key].reset_index(drop=True)

    # Question number
    ranked["question_no"] = i

    # Reference (correct) chapter
    # NOTE(review): `a` must already hold an entry for every question; it is
    # not filled in by any cell visible here -- confirm where it is populated.
    ranked["reference_chap"] = a[result_key][0]

    # Reorder the columns
    v[result_key] = pd.DataFrame(ranked,
                                 columns=['question_no', 'reference_chap', 'chapter', 'dot', 'doc_abs', 'query_abs', 'cos_similarity'])

v["result"+str(1)]

Unnamed: 0,question_no,reference_chap,chapter,dot,doc_abs,query_abs,cos_similarity
0,1,9,9,0.016874,0.016755,1.290379,0.780471
1,1,9,7,0.000962,0.002041,1.290379,0.36503
2,1,9,6,0.007308,0.017422,1.290379,0.325092
3,1,9,13,0.000708,0.001952,1.290379,0.28131
4,1,9,2,0.001523,0.004196,1.290379,0.28131
5,1,9,3,0.003812,0.010661,1.290379,0.277114
6,1,9,11,0.007015,0.021098,1.290379,0.257676
7,1,9,4,0.004601,0.015328,1.290379,0.23262
8,1,9,5,0.00243,0.008094,1.290379,0.23262
9,1,9,15,0.001327,0.00442,1.290379,0.23262


In [24]:
# Stack the ten per-question tables into one result table. pd.concat builds
# the table in a single pass and keeps each table's own row index, exactly as
# repeated DataFrame.append calls did (append was removed in pandas 2.0).
result = pd.concat([v["result" + str(i)] for i in range(1, 11)])

result

Unnamed: 0,question_no,reference_chap,chapter,dot,doc_abs,query_abs,cos_similarity
0,1,9,9.0,0.016874,0.016755,1.290379,0.780471
1,1,9,7.0,0.000962,0.002041,1.290379,0.365030
2,1,9,6.0,0.007308,0.017422,1.290379,0.325092
3,1,9,13.0,0.000708,0.001952,1.290379,0.281310
4,1,9,2.0,0.001523,0.004196,1.290379,0.281310
5,1,9,3.0,0.003812,0.010661,1.290379,0.277114
6,1,9,11.0,0.007015,0.021098,1.290379,0.257676
7,1,9,4.0,0.004601,0.015328,1.290379,0.232620
8,1,9,5.0,0.002430,0.008094,1.290379,0.232620
9,1,9,15.0,0.001327,0.004420,1.290379,0.232620


In [25]:
import pandas as pd

# Write the combined result table to sheet 'Sheet1' of the workbook. Leaving
# the `with` block saves and closes the file, also when to_excel raises
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('data/results_test.xlsx') as writer:
    result.to_excel(writer, sheet_name='Sheet1')