In [14]:
#### 라이브러리 import

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [16]:
art_info = pd.read_csv("article_info.csv")
art_info.head()

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
0,ARTICLE_0000,19 Tips For Everyday Git Use,I've been using git full time for the past 4 y...,HTML,en,USER_0683,,
1,ARTICLE_0001,Intel buys computer vision startup Itseez to i...,Intel has acquired computer vision and machine...,HTML,en,USER_1129,,
2,ARTICLE_0002,Practical End-to-End Testing with Protractor,One of the reasons AngularJS is so great to wo...,HTML,en,USER_0256,,
3,ARTICLE_0003,Corporate venture growth in Brazil is another ...,Despite recent positive news and a renewed int...,HTML,en,USER_1304,,
4,ARTICLE_0004,Cross-channel user experiences with Drupal (aw...,"Last year around this time, I wrote that The B...",HTML,en,USER_0336,,


In [17]:
len(art_info)

3008

In [18]:
art_info.isnull().sum()

articleID         0
Title             0
Content           0
Format            0
Language          0
userID            0
userCountry    2349
userRegion     2351
dtype: int64

In [19]:
view_log = pd.read_csv("view_log.csv")
view_log.head()

Unnamed: 0,userID,articleID,userRegion,userCountry
0,USER_0000,ARTICLE_0661,NY,US
1,USER_0000,ARTICLE_2316,NY,US
2,USER_0000,ARTICLE_1345,NY,US
3,USER_0000,ARTICLE_1089,NY,US
4,USER_0000,ARTICLE_1484,NY,US


# article_info의 열별 정보
- Format (HTML, VIDEO, RICH(서식이 있는 텍스트 포맷))
- Language (en(영어), pt(포르투갈어), la(라틴어), es(스페인어), ja(일본어))
- userCountry (BR(브라질), US(미국), CA(캐나다), AU(오스트레일리아), PT(포르투갈))
- userRegion (SP(브라질 상파울로), 

In [21]:
art_info['Format'].value_counts()

Format
HTML     2988
VIDEO      10
RICH       10
Name: count, dtype: int64

In [22]:
art_info['Language'].value_counts()

Language
en    2176
pt     825
la       3
es       2
ja       2
Name: count, dtype: int64

In [23]:
art_info['userCountry'].value_counts()

userCountry
BR    595
US     56
CA      5
AU      2
PT      1
Name: count, dtype: int64

In [24]:
art_info['userRegion'].value_counts()

userRegion
SP     522
MG      68
NY      18
NJ      16
GA       7
IL       6
RJ       4
ON       4
NSW      2
SC       2
TX       2
RI       1
WA       1
FL       1
MS       1
CA       1
AB       1
Name: count, dtype: int64

In [25]:
art_info[art_info['Language'] == 'ja']

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
1388,ARTICLE_1388,"The Algorithm March, Japan's Strangely Enterta...",Arugorizumu Koushin! アルゴリズムこうしん (Algorithm Mar...,HTML,ja,USER_0368,,
2438,ARTICLE_2438,40 Basic Japanese conversations,Japanese conversation using Ninja LINE sticker...,HTML,ja,USER_0368,,


In [26]:
art_info[art_info['Language'] == 'es']

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
524,ARTICLE_0524,Costa Rica presenta su primer edificio constru...,"San José, 22 abr (EFE).- Autoridades de Costa ...",HTML,es,USER_0704,,
988,ARTICLE_0988,La RAE lucha contra los anglicismos con una ca...,La Real Academia Española alerta sobre su abus...,HTML,es,USER_1166,,


In [27]:
art_info[art_info['Language'] == 'la']

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
756,ARTICLE_0756,Request lesson : How and when to use はず(=hazu)...,= Kotoshi no aki made niwa kare ga dekiru hazu...,HTML,la,USER_0617,,
808,ARTICLE_0808,git flow with support,"Lorem ipsum dolor sit amet, consectetur adipis...",HTML,la,USER_1244,,
955,ARTICLE_0955,"Within a Decade, Retail Banks will be Dead","A соuрlе of weeks ago, I found myself sitting ...",HTML,la,USER_1304,,


In [28]:
art_info[art_info['userCountry'] == 'PT']

Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion
2390,ARTICLE_2390,The new 64-bit Orange Pi is a quad-core comput...,Need a teeny tiny computer that can run Androi...,HTML,en,USER_1304,PT,


In [29]:
art_info = art_info.fillna('')

In [30]:
# Create a TfidfVectorizer and Remove stopwords
tfidf = TfidfVectorizer(stop_words='english')
# Fit and transform the data to a tfidf matrix
tfidf_matrix = tfidf.fit_transform(art_info['Content'])
# Print the shape of the tfidf_matrix
tfidf_matrix.shape

(3008, 72276)

In [31]:
# Compute the cosine similarity between each movie description
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [32]:
indices = pd.Series(art_info.index, index=art_info['Title']).drop_duplicates()

In [33]:
def get_recommendations(title, cosine_sim=cosine_sim, num_recommend = 10):
    idx = indices[title]
# Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
# Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Get the scores of the 10 most similar movies
    top_similar = sim_scores[1:num_recommend+1]
# Get the movie indices
    movie_indices = [i[0] for i in top_similar]
# Return the top 10 most similar movies
    return art_info['Title'].iloc[movie_indices]

In [34]:
get_recommendations('git flow with support', num_recommend = 5)

2465    Ranking 50 Maiores Empresas do E-commerce Bras...
771     [Free Online Course] Sit Less, Get Active - Th...
2252                             Type-based aliasing in C
1571    Liberty vai adotar telemetria e baratear segur...
1595    15 Awesome Sublime Text Plugins For Web Develo...
Name: Title, dtype: object

In [35]:
sample = pd.read_csv('sample_submission.csv')

In [36]:
sample.head()

Unnamed: 0,userID,articleID
0,USER_0000,ARTICLE_0000
1,USER_0000,ARTICLE_0001
2,USER_0000,ARTICLE_0002
3,USER_0000,ARTICLE_0003
4,USER_0000,ARTICLE_0004


In [37]:
# userID의 

In [38]:
for i in range(len(sample) // 5):
    sample['userID'][i * 5]

In [39]:
# x.to_csv('submission.csv', index = False)

NameError: name 'x' is not defined

#### 각 회원들이 본 기사를 배열 형식으로 변경하기

In [59]:
result = view_log.groupby('userID')['articleID'].agg(lambda x: list(x)).reset_index()
result.head()

Unnamed: 0,userID,articleID
0,USER_0000,"[ARTICLE_0661, ARTICLE_2316, ARTICLE_1345, ART..."
1,USER_0001,"[ARTICLE_1408, ARTICLE_0202, ARTICLE_2406, ART..."
2,USER_0002,"[ARTICLE_2504, ARTICLE_2803, ARTICLE_1910, ART..."
3,USER_0003,"[ARTICLE_2115, ARTICLE_1092, ARTICLE_0958, ART..."
4,USER_0004,"[ARTICLE_0173, ARTICLE_2259, ARTICLE_0690, ART..."


#### 기사 제목 ID와 실제 제목 합치기

In [60]:
result2 = pd.merge(view_log, art_info)
result2.head()

Unnamed: 0,userID,articleID,userRegion,userCountry,Title,Content,Format,Language
0,USER_0014,ARTICLE_1890,MG,BR,Stop Trying to Sound Smart When You're Writing,"When I read a piece of business writing, wheth...",HTML,en
1,USER_0014,ARTICLE_2789,MG,BR,Rating the English Proficiency of Countries an...,"In the aviation industry, miscommunication can...",HTML,en
2,USER_0014,ARTICLE_2789,MG,BR,Rating the English Proficiency of Countries an...,"In the aviation industry, miscommunication can...",HTML,en
3,USER_0014,ARTICLE_2789,MG,BR,Rating the English Proficiency of Countries an...,"In the aviation industry, miscommunication can...",HTML,en
4,USER_0014,ARTICLE_1890,MG,BR,Stop Trying to Sound Smart When You're Writing,"When I read a piece of business writing, wheth...",HTML,en


In [61]:
result3 = result2.groupby('userID')['articleID'].agg(lambda x: list(x)).reset_index()
result3.head()

Unnamed: 0,userID,articleID
0,USER_0014,"[ARTICLE_1890, ARTICLE_2789, ARTICLE_2789, ART..."
1,USER_0017,"[ARTICLE_0059, ARTICLE_0130, ARTICLE_2074, ART..."
2,USER_0021,"[ARTICLE_2911, ARTICLE_1743, ARTICLE_2911, ART..."
3,USER_0026,"[ARTICLE_0193, ARTICLE_0220, ARTICLE_0220, ART..."
4,USER_0047,"[ARTICLE_1917, ARTICLE_1028, ARTICLE_1028, ART..."


In [62]:
result4 = result2.groupby('userID')['Title'].agg(lambda x: list(x)).reset_index()
result4.head()

Unnamed: 0,userID,Title
0,USER_0014,[Stop Trying to Sound Smart When You're Writin...
1,USER_0017,"[All Together Now. Introducing G Suite., New E..."
2,USER_0021,[Plano nacional de ''internet das coisas'' sai...
3,USER_0026,"[Depois da Inteligência Artificial, vem aí a I..."
4,USER_0047,[A Data Engineer's Guide To Non-Traditional Da...


In [63]:
result2['Content'][0]

'When I read a piece of business writing, whether it\'s a proposal, a report, or a simple email, I\'m turned off by people who have invested more energy trying to sound smart than in trying to be smart. Ideally, I\'d like to read communications where I don\'t notice the writing at all. The best writing is so transparent that it doesn\'t obscure the underlying message. You can achieve that in your writing by investing in great content and then stripping away anything that detracts from it. How do you make your content great? Before crafting a single sentence, you determine the purpose and desired outcome of your communication. You go beyond the facts and information you\'re transmitting and push yourself to clarify what you want your audience to think, to feel, and to do after they\'ve read your message. For example, saying that you\'re 60% of the way to your annual target might leave one person thinking that you are progressing well and another thinking that the 40% gap is too large to

## 제목 기준, userRegion, userCountry, Title, Content, Format, Language를 기준으로 기사 추천해주기...