In [1]:
import os
import pprint as pp

import pandas as pd
import pymongo
from pymongo import MongoClient

In [2]:
# Connection string is taken from the MONGODB_URI environment variable when set.
# SECURITY NOTE(review): the fallback literal below embeds a username/password
# in the notebook. Rotate that credential and delete the literal once
# MONGODB_URI is configured wherever this notebook runs.
mongo_uri = os.environ.get(
    'MONGODB_URI',
    'mongodb+srv://mongo:mongo@ngranback.bmasa.mongodb.net/myFirstDatabase?retryWrites=true&w=majority',
)
client = MongoClient(mongo_uri)

db = client.NLG_DB
# Collection whose documents carry 'Character' / 'Dialogue' fields
# (queried below by 'Character', 'Season' and 'Writers').
query = db.ETL_collection
# Collection read below for its 'Writers', 'Rating' and 'SEID' fields.
info_query = db.ETL_ep_info

In [3]:
# Show a single document from ETL_collection to inspect which fields it stores.
query.find_one()

{'_id': ObjectId('60f0c99545668efa07177e56'),
 'Character': 'JERRY',
 'Dialogue': 'Do you know what this is all about? Do you know, why were here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about We should go out? This is what theyre talking about...this whole thing, were all out now, no one is home. Not one person here is home, were all out! There are people tryin to find us, they dont know where we are. (on an imaginary phone) Did you ring?, I cant find him. Where did he go? He didnt tell me where he was going. He must have gone out. You wanna go out you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then youre standing around, whatta you do? You go We gotta be getting back. Once youre out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right? Wh

In [4]:
# =======================================
# === Lines Per Season, per Character ===
# =======================================

chars = ['JERRY', 'GEORGE', 'ELAINE', 'KRAMER']
num_seasons = 9  # the 'Season' field is queried for values 1 through 9

# One list per character, in the same order as `chars`.
# Each list has one entry per season; the entry is the number of documents
# (lines) in the collection for that character in that season.
lps_lists = [
    [query.count_documents({"Character": name, 'Season': season})
     for season in range(1, num_seasons + 1)]
    for name in chars
]

# Per-character aliases: these are the same list objects held in lps_lists.
lps_J, lps_G, lps_E, lps_K = lps_lists

print(lps_J)
print(lps_G)
print(lps_E)
print(lps_K)

[564, 1059, 2001, 2018, 1821, 1834, 1693, 1762, 1793]
[261, 642, 1463, 1401, 1207, 1087, 1208, 1031, 1248]
[158, 466, 1116, 805, 1088, 1099, 1062, 1016, 1069]
[67, 284, 719, 795, 872, 909, 909, 1012, 985]


In [6]:
# ===============================
# === Get list of top Writers ===
# ===============================

# Every distinct value of the 'Writers' field (a co-writing credit such as
# 'Larry David, Jerry Seinfeld' is its own distinct value).
writers = query.distinct('Writers')

# Count the matching documents for each writer credit. Iterating over
# `writers` itself (rather than a fixed range(52)) keeps both DataFrame
# columns the same length however many distinct credits the collection holds.
# NOTE(review): `query` is the collection that stores one document per line of
# dialogue, so this count is presumably lines, not episodes, despite the
# 'Eps_Written' name -- confirm before relabelling.
eps_written = [query.count_documents({"Writers": writer}) for writer in writers]

writerDF = pd.DataFrame({
    'Writers': writers,
    'Eps_Written': eps_written
})

writerDF = writerDF.sort_values(['Eps_Written'], ascending=False)

# Top Writer list is currently limited to 6
top_n = 6
topWriters = writerDF.iloc[0:top_n, :]['Writers'].to_list()
topWriterscount = writerDF.iloc[0:top_n, :]['Eps_Written'].to_list()

topWriters

['Larry David',
 'Peter Mehlman',
 'Larry Charles',
 'Larry David, Jerry Seinfeld',
 'Tom Gammill, Max Pross',
 'Alec Berg, Jeff Schaffer']

In [7]:
# =======================================
# === Lines Per Writer, per Character ===
# =======================================

# TEMPORARY
# lpw_lists holds one list per character (same order as `chars`).
# Each of those lists has one entry per top writer (same order as
# `topWriters`): the number of documents for that character/writer pair.
lpw_lists = [
    [
        query.count_documents({'Character': name, "Writers": writer})
        for writer in topWriters
    ]
    for name in chars
]

# Per-character names bound to the very same list objects stored in lpw_lists.
lpw_J, lpw_G, lpw_E, lpw_K = lpw_lists

print(lpw_lists)

[[2372, 1173, 1255, 1360, 742, 777], [1702, 780, 849, 722, 500, 501], [1326, 691, 512, 538, 464, 467], [1034, 462, 551, 326, 423, 386]]


In [8]:
# Normalise LPW data: divide each character's count for a writer by that
# writer's overall count (topWriterscount), rounded to 2 decimals.
#
# NOTE(review): topWriterscount is taken from the 'Eps_Written' column, which
# is counted on the same collection as the per-character line counts, so the
# result looks like a share of lines (values well below 1) rather than an
# average number of lines per episode -- confirm the intended metric.
#
# WARNING: the lists are updated in place, so running this cell a second time
# without re-running the cell that builds lpw_lists divides the values again.

for char_counts in lpw_lists:
    for j, writer_total in enumerate(topWriterscount):
        char_counts[j] = round(char_counts[j] / writer_total, 2)

print(lpw_J)
print(lpw_G)
print(lpw_E)
print(lpw_K)

# 1 list per character
# lpw_X = list of 1 entry per writer
# each entry is the fraction (0-1, not 0-100) of that writer's total count
# that belongs to that character

[0.26, 0.27, 0.3, 0.34, 0.23, 0.27]
[0.19, 0.18, 0.2, 0.18, 0.16, 0.17]
[0.14, 0.16, 0.12, 0.14, 0.15, 0.16]
[0.11, 0.11, 0.13, 0.08, 0.13, 0.13]


In [15]:

# ==================================
# === Episode Ratings per Writer ===
# ==================================

# writerRatings holds one list per entry in topWriters (same order).
# Each inner list holds the 'Rating' of every document in the episode-info
# collection whose 'Writers' field matches that writer.
# It is built from topWriters itself rather than from six literal empty lists,
# so it keeps working if the number of top writers changes.
writerRatings = [
    [doc['Rating'] for doc in info_query.find({"Writers": writer})]
    for writer in topWriters
]

# lists in writerRatings are NOT the same lengths!!! these are for box plots


In [11]:
# =====================================
# === Episode Ratings per Character ===
# =====================================

# each character needs a dictionary with one key per rating (( 7.0 --> 9.6 ))
# each key will have list with 1 entry per episode with that rating
# each entry will contain the number of lines that character had in that episode

# will do this in 4 steps to avoid confusion


# all ratings (there are 27): 7.0, 7.1, ..., 9.6
ratings = [x/10 for x in range(70,97)]

# rated_eps[i] = list of SEIDs of the episode-info documents whose 'Rating'
# equals ratings[i] (an empty list when no document has that rating)
rated_eps = [
    [doc['SEID'] for doc in info_query.find({"Rating": rating})]
    for rating in ratings
]

# total number of episodes per rating (for later use if we want it).
# Derived from the SEIDs already fetched, so each rating costs one query
# instead of a separate count_documents call plus a find.
episodes_per_rating = [len(seids) for seids in rated_eps]

# ratings = list of 27 possible ratings
# rated_eps = list of 27 lists
# episodes_per_rating = list of 27 counts, episodes_per_rating[i] == len(rated_eps[i])



In [12]:

# ---TO DO---
# need to collect number of lines per character for each SEID in list
# then repeat that for every list in rated_eps

# should be separated by character, not by rating, in order to build one data set per character

In [16]:
# ====================================
# === Episode Ratings per Air Date ===
# ====================================


# Need to figure out how to read in the dates