In [1]:
from pymongo import MongoClient
import json
import re
import pandas as pd

In [2]:
# Open a client on the local MongoDB server and select the project database
client = MongoClient(host='localhost', port=27017)
db = client.Project2

# Import all data

In [3]:
# Read every document of the Steam collection and flatten it into a DataFrame
steam_docs = db['Steam'].find()
games = pd.json_normalize(steam_docs)

In [4]:
# Number of Steam games loaded
games.shape[0]

5209

In [5]:
# Read every document of the metacriticPc collection and flatten it into a DataFrame
pc_docs = db['metacriticPc'].find()
reviewsPc = pd.json_normalize(pc_docs)

In [6]:
# Number of PC review documents loaded
reviewsPc.shape[0]

78130

In [7]:
# Read every document of the metacriticAll collection and flatten it into a DataFrame
all_docs = db['metacriticAll'].find()
reviewsAll = pd.json_normalize(all_docs)

In [8]:
# Number of all-platform review documents loaded
reviewsAll.shape[0]

26432

# First merge
**games** with **metacriticPC** on *normTitle*

In [9]:
# Left-join the PC review score onto every game using the normalised title
pc_scores = reviewsPc[['normTitle', 'score']]
merged = games.merge(pc_scores, how='left', on='normTitle')

In [10]:
# A left join must keep exactly one row per game: extra rows would mean that a
# title matched several reviews, which would inflate every count computed below.
assert len(merged) == len(games), "merge changed the number of game rows"
len(merged)

5209

# Second merge
**games** with **metacriticPC** on *normTitle2*

In [11]:
# Second attempt: match the alternative title (normTitle2) against the PC reviews.
# Overlapping column names get the default _x (left) / _y (right) suffixes.
pc_scores = reviewsPc[['normTitle', 'score']]
merged = merged.merge(pc_scores, how='left', left_on='normTitle2', right_on='normTitle')

In [12]:
# The right-hand title column only served as join key; discard it
merged = merged.drop(columns='normTitle_y')

In [13]:
# Restore the title column name and label the two PC scores by the merge they came from
pc_column_names = {
    'normTitle_x': 'normTitle',
    'score_x': 'scorePc1',
    'score_y': 'scorePc2',
}
merged = merged.rename(columns=pc_column_names)

# Third merge
**games** with **metacriticALL** on *normTitle*

In [15]:
# Left-join the all-platform review score using the normalised title
all_scores = reviewsAll[['normTitle', 'score']]
merged = merged.merge(all_scores, how='left', on='normTitle')

In [16]:
# Label the score brought in by the third merge
merged = merged.rename(columns={'score': 'scoreAll1'})

In [17]:
# A left join must keep exactly one row per game: extra rows would mean that a
# title matched several reviews, which would inflate every count computed below.
assert len(merged) == len(games), "merge changed the number of game rows"
len(merged)

5209

# Fourth merge
**games** with **metacriticALL** on *normTitle2*

In [18]:
# Fourth attempt: match the alternative title (normTitle2) against the all-platform reviews.
# The duplicated title column gets the default _x (left) / _y (right) suffixes.
all_scores = reviewsAll[['normTitle', 'score']]
merged = merged.merge(all_scores, how='left', left_on='normTitle2', right_on='normTitle')

In [19]:
# The right-hand title column only served as join key; discard it
merged = merged.drop(columns='normTitle_y')

In [20]:
# Restore the title column name and label the score brought in by the fourth merge
all_column_names = {
    'normTitle_x': 'normTitle',
    'score': 'scoreAll2',
}
merged = merged.rename(columns=all_column_names)

In [21]:
# A left join must keep exactly one row per game: extra rows would mean that a
# title matched several reviews, which would inflate every count computed below.
assert len(merged) == len(games), "merge changed the number of game rows"
len(merged)

5209

# Review

## First Merge

With the first merge, we merged **4275** games

In [22]:
# Games for which the first merge found a review entry (any non-null value, 'tbd' included)
matched_first = merged['scorePc1'].notna()
subM = merged.loc[matched_first]
subM.shape[0]

4275

Of these 4275, **2744** have an actual score (the others are still `tbd`)

In [23]:
# Games still without a usable score after the first merge: 'tbd' rows first, then unmatched rows
first_score = merged['scorePc1']
tbd_rows = merged.loc[first_score == 'tbd']
unmatched_rows = merged.loc[first_score.isna()]
ss = pd.concat([tbd_rows, unmatched_rows])

In [24]:
# Games with a usable score after the first merge.
# The total is taken from the data instead of a pasted literal, so the figure
# stays correct if the collections change.
len(merged) - len(ss)

2744

## Second Merge

Using *normTitle2*, we merged **50** games

In [25]:
# Ids of the games for which the second merge found a review entry
matched_second = merged['scorePc2'].notna()
set2 = set(merged.loc[matched_second, '_id'])
len(set2)

50

Of these 50, **21** are new games (not already matched by the first merge)

In [26]:
# Ids matched by the second merge that the first merge had not matched
set1 = set(subM['_id'])
setD = set2.difference(set1)
len(setD)

21

Thanks to the second merge, we have **31** new games with a review score

In [27]:
# Narrow the unscored games to those the second merge could not score either
second_score = ss['scorePc2']
tbd_rows = ss.loc[second_score == 'tbd']
unmatched_rows = ss.loc[second_score.isna()]
ss = pd.concat([tbd_rows, unmatched_rows])

In [28]:
# Games that gained a usable score from the second merge: everything scored so
# far, minus what the first merge had already scored. Both figures are derived
# from `merged` rather than pasted from earlier outputs, so they cannot go stale.
earlier_scores = merged[['scorePc1']]
scored_earlier = int((earlier_scores.notnull() & (earlier_scores != 'tbd')).any(axis=1).sum())
len(merged) - len(ss) - scored_earlier

31

## Third Merge

Merging with the reviews taken from all the other platforms, we merged **2161** games

In [29]:
# Ids of the games for which the third merge found a review entry
matched_third = merged['scoreAll1'].notna()
set3 = set(merged.loc[matched_third, '_id'])
len(set3)

2161

Of these 2161, **93** are new games (not matched by either of the first two merges)

In [30]:
# Ids matched by the first or the second merge
set12 = set1 | set2

In [31]:
# How many ids the third merge matched that the first two had not
len(set3.difference(set12))

93

Thanks to the third merge, we have **334** new games with a review score

In [32]:
# Narrow the unscored games to those the third merge could not score either
third_score = ss['scoreAll1']
tbd_rows = ss.loc[third_score == 'tbd']
unmatched_rows = ss.loc[third_score.isna()]
ss = pd.concat([tbd_rows, unmatched_rows])

In [33]:
# Games that gained a usable score from the third merge: everything scored so
# far, minus what the first two merges had already scored. Both figures are
# derived from `merged` rather than pasted from earlier outputs.
earlier_scores = merged[['scorePc1', 'scorePc2']]
scored_earlier = int((earlier_scores.notnull() & (earlier_scores != 'tbd')).any(axis=1).sum())
len(merged) - len(ss) - scored_earlier

334

## Fourth Merge

Merging with the reviews taken from all the other platforms and using *normTitle2*, we merged **32** games

In [34]:
# Ids of the games for which the fourth merge found a review entry
matched_fourth = merged['scoreAll2'].notna()
set4 = set(merged.loc[matched_fourth, '_id'])
len(set4)

32

Of these 32, **3** are new games (not matched by any of the first three merges)

In [35]:
# Ids matched by any of the first three merges, then the ids only the fourth merge matched
set123 = set12 | set3
len(set4.difference(set123))

3

Thanks to the fourth merge, we have **4** new games with a review score.

In [36]:
# Narrow the unscored games to those the fourth merge could not score either
fourth_score = ss['scoreAll2']
tbd_rows = ss.loc[fourth_score == 'tbd']
unmatched_rows = ss.loc[fourth_score.isna()]
ss = pd.concat([tbd_rows, unmatched_rows])

In [37]:
# Games that gained a usable score from the fourth merge: everything scored so
# far, minus what the first three merges had already scored. Both figures are
# derived from `merged` rather than pasted from earlier outputs.
earlier_scores = merged[['scorePc1', 'scorePc2', 'scoreAll1']]
scored_earlier = int((earlier_scores.notnull() & (earlier_scores != 'tbd')).any(axis=1).sum())
len(merged) - len(ss) - scored_earlier

4

## In Conclusion

In [38]:
# One dictionary per game row, so that the rows can be edited record by record
mDict = merged.to_dict('records')

In [39]:
# Quick probe: does the first record carry a score from the first merge?
first_record = mDict[0]
pd.notna(first_record['scorePc1'])

True

In [40]:
# Flag every game that was matched by at least one of the four merges
# (a non-null value in any score column, 'tbd' included)
score_columns = ('scorePc1', 'scorePc2', 'scoreAll1', 'scoreAll2')
for game in mDict:
    game['merged'] = any(pd.notnull(game[name]) for name in score_columns)

In [41]:
# Collapse the four score columns into scorePc1: keep the first usable value in
# priority order (scorePc1, scorePc2, scoreAll1, scoreAll2).
# A value is usable when it is neither missing nor the placeholder 'tbd'.
# If no column is usable the score becomes NaN. Copying the last column verbatim
# would let a trailing 'tbd' slip through the not-null filter applied later and
# be counted as a real review score.
fallback_order = ['scorePc1', 'scorePc2', 'scoreAll1', 'scoreAll2']
for game in mDict:
    game['scorePc1'] = next(
        (
            game[col]
            for col in fallback_order
            if pd.notnull(game[col]) and game[col] != 'tbd'
        ),
        float('nan'),
    )

In [47]:
# Back to a DataFrame; the fallback score columns are no longer needed
fallback_columns = ['scorePc2', 'scoreAll1', 'scoreAll2']
final = pd.json_normalize(mDict).drop(columns=fallback_columns)

In [48]:
# Keep only the games matched by at least one merge
was_merged = final['merged'].eq(True)
final = final.loc[was_merged]

In [49]:
# The helper flag has served its purpose
final = final.drop(columns='merged')

We have been able to merge **4392** games

In [50]:
# Number of games matched by at least one merge
final.shape[0]

4392

Of these 4392 games, **3113** have a review score on Metacritic

In [51]:
# Keep only the games whose consolidated score is present
has_score = final['scorePc1'].notna()
final = final.loc[has_score]

In [52]:
# Number of games left with a review score
final.shape[0]

3113

# Upload to MongoDB

In [114]:
# Handle to the target collection for the merged games
GameRev = db['GameReviews']

In [113]:
# Preview the first five rows that will be uploaded
final.head(5)

Unnamed: 0,_id,rank,SN,title,relDate,price,posPercRec,totRec,posPercTot,totTot,genres,normTitle,normTitle2,scorePc1
0,63a479b667c4b69609bddc1f,1,1938090,Call of Duty®: Modern Warfare® II,2022-10-27,"69,99€",66%,57500,61%,152177,"[Sparatutto in prima persona, Azione, Sparatut...",call of duty modern warfare 2,,86
1,63a479b667c4b69609bddc20,2,730,Counter-Strike: Global Offensive,2012-08-21,Free-to-Play,89%,65349,88%,6811463,"[Sparatutto in prima persona, Sparatutto, Mult...",counterstrike global offensive,,83
2,63a479b667c4b69609bddc21,3,1811260,EA SPORTS™ FIFA 23,2022-09-29,"69,99€",54%,16342,48%,38565,"[Sport, Calcio, Simulatori immersivi, eSport, ...",fifa 23,,77
3,63a479b667c4b69609bddc22,4,570,Dota 2,2013-07-09,Free-to-Play,81%,34937,82%,1897116,"[Free-to-Play, MOBA, Multigiocatore, Strategia...",dota 2,,90
4,63a479b667c4b69609bddc23,5,1172470,Apex Legends™,2020-11-04,Free-to-Play,72%,22042,84%,552941,"[Free-to-Play, Multigiocatore, Battle Royale, ...",apex legends,,88


In [115]:
# One dictionary per game, the shape expected for the bulk insert
finalDict = final.to_dict('records')

In [117]:
# Fold normTitle2 into normTitle: the alternative title is always removed from
# the record and only used when the primary title is missing
for game in finalDict:
    alternative_title = game['normTitle2']
    if pd.isnull(game['normTitle']):
        game['normTitle'] = alternative_title
    del game['normTitle2']

In [119]:
# insert documents into the collection
# NOTE(review): every record still carries the `_id` column of the merged frame,
# so running this cell a second time will presumably fail with duplicate-key
# errors — confirm, and empty the collection first if a re-run is needed.
GameRev.insert_many(finalDict)

<pymongo.results.InsertManyResult at 0x19850e0cd30>