In [219]:
import pandas as pd
import numpy as np

In [220]:
# Pool of given names sampled (with replacement) when generating users.
FIRST_NAMES = [
    "Andrei", "Maria", "Alexandru", "Ioana", "Mihai",
    "Elena", "Daniel", "Gabriela", "Cristian", "Ana",
    "Florin", "Diana", "Sorin", "Roxana", "Vasile",
    "Adriana", "Cătălin", "Raluca", "Ionuț", "Monica",
    "George", "Mădălina", "Radu", "Oana", "Cosmin",
    "Simona", "Nicu", "Camelia", "Lucian", "Alina",
]

# Pool of family names sampled (with replacement) when generating users.
LAST_NAMES = [
    "Bobescu", "Ionescu", "Dumitrescu", "Stan", "Constantinescu",
    "Gheorghiu", "Marinescu", "Radu", "Tudor", "Diaconu",
    "Mihailescu", "Preda", "Vasilescu", "Enache", "Petrescu",
    "Nistor", "Sima", "Voinea", "Iancu", "Filip",
    "Pavel", "Dobre", "Ciobanu", "Popa", "Bălan",
    "Stanciu", "Lupu", "Sârbu", "Matei", "Georgescu",
]


In [221]:
# --- Generation parameters -------------------------------------------------
# Number of candidate users drawn; later cells drop some of them, so the
# exported table is smaller than this.
NUM_USERS = 1000
# Probability that a user gets the is_professor flag.
PROFESSOR_RATE = 0.7
# Probability that a user gets the is_student flag. The two flags are drawn
# with separate np.random.choice calls, so a user can end up with both or neither.
STUDENT_RATE = 0.3

# Seed NumPy's legacy global RNG (the np.random.* functions used below) so that
# Restart Kernel -> Run All regenerates the same users and subject assignments.
SEED = 42
np.random.seed(SEED)

In [222]:
# Per-subject scrape results. Later cells read the columns `subject_eng`,
# `student_count` and `professor_count` from this frame.
data = pd.read_csv(filepath_or_buffer='../scraping/results.csv')

In [223]:
# Smooth the scraped counts before they are turned into sampling weights:
# every student count is shifted up by 10, while professor counts are only
# raised to a floor of 10 (values already >= 10 are left unchanged).
# NOTE: the shift is not idempotent - re-running this cell adds another 10.
data['student_count'] = data['student_count'].add(10)
data['professor_count'] = data['professor_count'].clip(lower=10)

In [224]:
# Normalise each count column by its own total so the resulting rates sum to 1
# and can be passed as the `p` weights of np.random.choice.
data['student_rate'] = data['student_count'].div(data['student_count'].sum())
data['professor_rate'] = data['professor_count'].div(data['professor_count'].sum())

In [225]:
# Build the synthetic user table in one go. The random draws are listed in the
# same order as the columns, so the sequence of RNG calls is:
# is_professor, is_student, first_name, last_name, rating_avg.
users = pd.DataFrame({
    'user_id': range(NUM_USERS),
    # Role flags are sampled independently of each other.
    'is_professor': np.random.choice(
        [False, True], size=NUM_USERS, p=[1 - PROFESSOR_RATE, PROFESSOR_RATE]
    ),
    'is_student': np.random.choice(
        [False, True], size=NUM_USERS, p=[1 - STUDENT_RATE, STUDENT_RATE]
    ),
    'first_name': np.random.choice(FIRST_NAMES, size=NUM_USERS),
    'last_name': np.random.choice(LAST_NAMES, size=NUM_USERS),
    # Normal(2.5, 1) rounded to one decimal, then limited to the 0..5 range.
    'rating_avg': np.clip(
        np.round(np.random.normal(loc=2.5, scale=1, size=NUM_USERS), 1), 0, 5
    ),
})

# E-mail is "<last>.<first>@gmail.rom", lower-cased.
# NOTE: with 30 x 30 name combinations and 1000 users the addresses cannot all
# be unique, and names with diacritics are kept as-is in the address.
email_local_part = users['last_name'].str.lower().str.cat(
    users['first_name'].str.lower(), sep='.'
)
users['email'] = email_local_part + "@gmail.rom"

In [226]:
# Head-counts per role (sum of a boolean column = number of True values);
# used as the sample sizes when subjects are assigned.
num_professors = users.loc[:, 'is_professor'].sum()
num_students = users.loc[:, 'is_student'].sum()

In [227]:
# Assign a subject to every professor (teaching_subject) and every student
# (learning_subject), weighted by the per-subject rates computed from `data`.
#
# The number of draws is taken from the masks themselves rather than from
# counts computed in another cell: this cell shrinks `users` at the end, so
# externally cached counts go stale and a re-run would fail with a length
# mismatch between the mask and the sampled array.
subject_names = data['subject_eng'].to_numpy()
professor_mask = users['is_professor']
student_mask = users['is_student']

users.loc[professor_mask, 'teaching_subject'] = np.random.choice(
    subject_names,
    size=int(professor_mask.sum()),
    p=data['professor_rate'].to_numpy(),
)

users.loc[student_mask, 'learning_subject'] = np.random.choice(
    subject_names,
    size=int(student_mask.sum()),
    p=data['student_rate'].to_numpy(),
)

# Drop users who would teach and learn the very same subject. Missing values
# compare as "not equal", so users with only one role (or none) are kept here.
users = users[users['teaching_subject'] != users['learning_subject']]

In [228]:
# Remove users that ended up with neither a teaching nor a learning subject,
# then discard the helper role flags, which are not part of the exported table.
users = (
    users
    .dropna(subset=['teaching_subject', 'learning_subject'], how='all')
    .drop(columns=['is_professor', 'is_student'])
)

In [229]:
# Display the final user table (pandas truncates the rendering of long frames).
users

Unnamed: 0,user_id,first_name,last_name,rating_avg,email,teaching_subject,learning_subject
3,3,Cosmin,Ionescu,2.0,ionescu.cosmin@gmail.rom,Political science,Psychology
4,4,Camelia,Bobescu,0.3,bobescu.camelia@gmail.rom,Romanian language,
5,5,Radu,Marinescu,2.7,marinescu.radu@gmail.rom,,Geography
7,7,Alina,Tudor,3.2,tudor.alina@gmail.rom,English language,
9,9,Florin,Sima,1.8,sima.florin@gmail.rom,Mathematics,
...,...,...,...,...,...,...,...
993,993,Florin,Radu,2.0,radu.florin@gmail.rom,Mathematics,
994,994,Ionuț,Dobre,4.2,dobre.ionuț@gmail.rom,English language,
995,995,Roxana,Matei,2.6,matei.roxana@gmail.rom,Physics,
996,996,Mihai,Stan,3.4,stan.mihai@gmail.rom,,Romanian language


In [235]:
# Persist the user table to the working directory; the DataFrame index is not
# written because `user_id` already identifies each row.
users.to_csv(path_or_buf='users.csv', index=False)

In [236]:
# Build teacher -> student edges: one row for every (teacher, student) pair in
# which the teacher's teaching_subject equals the student's learning_subject.
#
# Rows with a missing key are filtered out *before* the join. pandas matches
# null join keys with each other, so merging the unfiltered table would first
# pair every user without a teaching subject with every user without a
# learning subject, only for those rows to be thrown away afterwards.
# Only the columns needed for the edge list are carried through the merge.
teachers = (
    users.loc[users['teaching_subject'].notna(), ['user_id', 'teaching_subject']]
    .rename(columns={'user_id': 'user_id_teacher', 'teaching_subject': 'teaching'})
)
students = (
    users.loc[users['learning_subject'].notna(), ['user_id', 'learning_subject']]
    .rename(columns={'user_id': 'user_id_student'})
)

edges = teachers.merge(students, left_on='teaching', right_on='learning_subject')
# Same three output columns as before: teacher id, student id, shared subject.
edges = edges[['user_id_teacher', 'user_id_student', 'teaching']]

In [237]:
# Persist the teacher/student edge list to the working directory, without the
# DataFrame index.
edges.to_csv(path_or_buf='teacher_student_connections.csv', index=False)