In [307]:
import pandas as pd
import numpy as np

In [308]:
FIRST_NAMES = [
    "Andrei", "Maria", "Alexandru", "Ioana", "Mihai", "Elena", "Daniel",
    "Gabriela", "Cristian", "Ana", "Florin", "Diana", "Sorin", "Roxana",
    "Vasile", "Adriana", "Cătălin", "Raluca", "Ionuț", "Monica",
    "George", "Mădălina", "Radu", "Oana", "Cosmin", "Simona", "Nicu",
    "Camelia", "Lucian", "Alina"
]

LAST_NAMES = [
    "Bobescu", "Ionescu", "Dumitrescu", "Stan", "Constantinescu", "Gheorghiu",
    "Marinescu", "Radu", "Tudor", "Diaconu", "Mihailescu", "Preda", "Vasilescu",
    "Enache", "Petrescu", "Nistor", "Sima", "Voinea", "Iancu", "Filip",
    "Pavel", "Dobre", "Ciobanu", "Popa", "Bălan", "Stanciu", "Lupu", "Sârbu",
    "Matei", "Georgescu"
]


In [309]:
NUM_USERS = 1000
PROFESSOR_RATE = 0.7
STUDENT_RATE = 0.3

In [310]:
data = pd.read_csv('../resources/scraping/results.csv')

In [311]:
data['student_count'] = data['student_count'] + 10
data['professor_count'] = data['professor_count'].clip(10)

In [312]:
data['student_rate'] = data['student_count'] / data['student_count'].sum()
data['professor_rate'] = data['professor_count'] / data['professor_count'].sum()

In [327]:
users = pd.DataFrame()
users['user_id'] = range(NUM_USERS)
users['is_professor'] = np.random.choice([False, True], p=[1 - PROFESSOR_RATE, PROFESSOR_RATE], size=NUM_USERS)
users['is_student'] = np.random.choice([False, True], p=[1 - STUDENT_RATE, STUDENT_RATE], size=NUM_USERS)
users['first_name'] = np.random.choice(FIRST_NAMES, size=NUM_USERS)
users['last_name'] = np.random.choice(LAST_NAMES, size=NUM_USERS)
users['rating_avg'] = np.random.normal(loc=3, scale=1, size=NUM_USERS).round(1).clip(min=1, max=5)
users['email'] = users['last_name'].str.lower() + (pd.Series(['.'] * NUM_USERS)) + users['first_name'].str.lower() + np.random.randint(1, 100, NUM_USERS).astype(str) + (pd.Series(['@gmail.com'] * NUM_USERS))
users = users.drop_duplicates(subset=['email'])

In [328]:
users

Unnamed: 0,user_id,is_professor,is_student,first_name,last_name,rating_avg,email
0,0,True,False,Simona,Filip,4.8,filip.simona16@gmail.com
1,1,True,False,Ana,Radu,2.6,radu.ana3@gmail.com
2,2,False,True,Cosmin,Constantinescu,2.1,constantinescu.cosmin91@gmail.com
3,3,False,False,Monica,Constantinescu,3.8,constantinescu.monica52@gmail.com
4,4,True,False,Ana,Enache,3.7,enache.ana59@gmail.com
...,...,...,...,...,...,...,...
994,994,True,True,Ioana,Mihailescu,5.0,mihailescu.ioana5@gmail.com
995,995,False,True,Mădălina,Ionescu,1.4,ionescu.mădălina47@gmail.com
997,997,False,False,Cosmin,Preda,4.3,preda.cosmin91@gmail.com
998,998,True,False,Diana,Nistor,2.2,nistor.diana29@gmail.com


In [329]:
num_professors = users['is_professor'].sum()
num_students = users['is_student'].sum()

In [330]:
users.loc[users['is_professor'] == True, 'teaching_subject'] = np.random.choice(data['subject_eng'].values, num_professors, p = data['professor_rate'])

users.loc[users['is_student'] == True, 'learning_subject'] = np.random.choice(data['subject_eng'].values, num_students, p = data['student_rate'])


users = users[users['teaching_subject'] != users['learning_subject']]
users = users.dropna(subset = ['teaching_subject', 'learning_subject'], how = 'all')


In [331]:
users.to_csv('../resources/userbase/users.csv', index = False)

In [332]:
users = users.drop(columns = ['is_professor', 'is_student'])

In [333]:
edges = pd.merge(users, users, left_on='teaching_subject', right_on='learning_subject', suffixes=('_teacher', '_student'))
edges = edges.dropna(subset=['teaching_subject_teacher', 'learning_subject_student'])
edges = edges[['user_id_teacher', 'user_id_student', 'teaching_subject_teacher', 'rating_avg_teacher']]
edges = edges.rename(columns = {'teaching_subject_teacher' : 'subject'})

In [334]:
edges.to_csv('../resources/userbase/teacher_student_connections.csv', index = False)