In [None]:
%pip install openpyxl
%pip install openai

In [None]:
import openai
from openai import OpenAI
from time import sleep
import os
import collections
import pandas as pd
import sys
import time
import re
import numpy as np

# Paste a key here only for local runs and never commit it. When this is left
# blank, an OPENAI_API_KEY already exported in the environment is used as-is
# instead of being overwritten with an empty string.
OPENAI_API_KEY = ''
if OPENAI_API_KEY:
  os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Topics the model is asked to label, in the same order as the *_yn columns
# that store the answers. The numbers shown in the prompt are derived from this
# order, so they are always consecutive.
TOPICS = (
    "High Frequency Trading",
    "Quantitative Finance",
    "Computational Finance",
    "Drug Discovery",
    "Cancer",
    "Biology",
    "Radiology",
    "Neuroscience",
    "Evolution",
    "Protein Translation",
    "Ecology",
    "Cellular Biology",
    "Infectious Diseases",
    "Medicine",
    "Sociology",
    "Psychology",
    "Social Science",
    "Genomics",
    "Natural Languages",
    "Software Engineering",
    "Robotics",
    "Data Science",
    "Artificial Intelligence",
    "Computer Vision",
    "Internet of Things",
    "Human Computer Interaction",
    "Mechanical Engineering",
    "Blockchain",
    "Political Science",
    "Economics",
    "Deep Learning",
    "Time Series",
    "Machine Learning",
    "Computer Science",
    "Computer Graphics",
    "Math",
    "Computer Architecture",
    "Parallel Programming",
    "Logic",
    "Cybersecurity",
    "Privacy",
    "Cryptography",
    "Data Management",
    "Hardware",
    "Databases",
    "Prompting",
    "Large Language Models",
    "Query Optimization",
    "Reinforcement Learning",
    "Graphical Neural Networks",
    "Web Internet",
    "Autonomous Systems",
    "Electrical Engineering",
    "Attention in Natural Language Processing",
    "Bayesian Machine Learning",
    "Climate Environment",
    "Big Data",
    "Algorithms",
    "Probabilistic Algorithms",
    "Learning Theory",
    "Programming Languages",
    "Statistics",
    "Systems",
    "Quantum Computing",
    "Combinatorics",
    "Control Theory",
    "Ethics",
    "Information Theory",
    "Computer Science Education",
    "Data Mining",
    "Bias",
    "Convex Optimization",
    "Linear Algebra",
    "Geometry",
    "Compilers",
    "Wireless Technology",
    "Concept Bottleneck Model",
    "3D Reconstruction",
    "Alzheimer's Disease",
    "Transfer Learning",
    "Real Time Systems",
    "Data Visualization",
    "Physics",
    "Cyber Physical Systems",
    "Physical Substrates",
    "Game Theory",
    "Antitrust",
    "Runtime",
    "Chatbots",
    "Law",
    "Networking",
    "Cloud Computing",
)


def get_prompt(question):
  """
  Builds the topic-classification prompt for a single question.

  Inputs:
    question - the question text to classify

  Returns: the prompt string. It lists every entry of TOPICS with a
  consecutive number and asks for one "<number>. <topic>: <0 or 1>" line per
  topic, which is the line shape the labelling loop parses.
  """
  topic_lines = "\n".join(
      f"{number}. {topic}" for number, topic in enumerate(TOPICS, start=1))
  return (
      "You are an intelligent assistant that can determine if a given question is related to a particular topic.\n"
      "For each of the following topics, return a 1 if the given question is related to the topic, and 0 if the question is not related to the topic.\n"
      "\n"
      f"The question is: {question}\n"
      "\n"
      "The topics:\n"
      f"{topic_lines}\n"
      "\n"
      # Spell out the answer format so the reply can be parsed line by line.
      "Answer with exactly one line per topic, in the order listed, in the form\n"
      "<number>. <topic>: <0 or 1>\n"
      "and nothing else.\n"
  )


In [None]:
def model(question):
  """
  Sends the topic-classification prompt for one question to the OpenAI chat
  completions API (model "gpt-4-0613") and returns the reply text.

  Inputs:
        question - the question text to classify

  Returns: the raw text of the first completion choice, or "" when the API
  returns no text. Parsing the per-topic 0/1 flags is left to the caller.
  """
  client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
  chat_completion = client.chat.completions.create(
                      messages=[{"role": "user", "content": get_prompt(question)}],
                      model="gpt-4-0613",
                      # Labelling should be as repeatable as the API allows.
                      temperature=0)
  # The response is a typed object; read the fields directly instead of
  # casting each level to dict.
  return chat_completion.choices[0].message.content or ""

In [None]:
def read_file(file):
  """
  Loads the question column from an Excel workbook.

  Input:
        file - xlsx file (anything pd.read_excel accepts) that has a
               'Test/Dev Input' column

  Output: the values of the 'Test/Dev Input' column as a list
  """
  workbook = pd.read_excel(file)
  return workbook['Test/Dev Input'].tolist()

In [None]:
def process_string(input_str):
    """
    Extracts the integer values from a string of "key - value" items.

    Items are separated by two spaces and each item is split on " - ".
    Items that do not split into exactly two parts are ignored.

    Inputs:
        input_str - e.g. "a - 1  b - 0"

    Returns: list of the values converted with int(), in input order
    """
    pairs = (chunk.split(" - ") for chunk in input_str.split("  "))
    return [int(parts[1]) for parts in pairs if len(parts) == 2]

In [None]:
# Column layout of the question/topic matrix: the question text followed by one
# '<topic>_yn' flag column per topic, in the order the topics appear in the
# prompt. Names are kept exactly as they were (including the
# 'parallel_prgramming' spelling) because they become CSV headers.
cols_to_add = ['question'] + [
    f'{stem}_yn'
    for stem in (
        'high_frequency_trading', 'quantitative_finance', 'computational_finance',
        'drug_discovery', 'cancer', 'biology', 'radiology', 'neuroscience',
        'evolution', 'protein_translation', 'ecology', 'cellular_bio',
        'infectious_diseases', 'medicine', 'sociology', 'psychology',
        'social_science', 'genomics', 'natural_languages', 'software_engineering',
        'robotics', 'data_science', 'artificial_intelligence', 'computer_vision',
        'internet_of_things', 'human_computer_interaction', 'mechanical_engineering',
        'blockchain', 'political_science', 'economics', 'deep_learning',
        'time_series', 'machine_learning', 'computer_science', 'computer_graphics',
        'math', 'computer_architecture', 'parallel_prgramming', 'logic',
        'cybersecurity', 'privacy', 'cryptography', 'data_management', 'hardware',
        'databases', 'prompting', 'large_language_model', 'query_optimization',
        'reinforcement_learning', 'graph_neural_network', 'web_internet',
        'autonomous_systems', 'electrical_engineering', 'attention', 'bayesian_ml',
        'climate_environment', 'big_data', 'algorithms', 'probabilistic_algos',
        'learning_theory', 'programming_languages', 'statistics', 'systems',
        'quantum_computing', 'combinatorics', 'control_theory', 'ethics',
        'information_theory', 'computer_science_education', 'data_mining', 'bias',
        'convex_optimization', 'linear_algebra', 'geometry', 'compilers',
        'wireless_tech', 'concept_bottleneck_model', '3d_reconstruction',
        'alzheimers_disease', 'transfer_learning', 'real_time_systems',
        'data_visualization', 'physics', 'cyber_physical_systems',
        'physical_substrates', 'game_theory', 'antitrust', 'runtime', 'chatbot',
        'law', 'networking', 'cloud_computing',
    )
]

# Empty frame that the labelling loop fills with one row per question.
df = pd.DataFrame(columns=cols_to_add)

In [None]:
# Evaluation workbook: later cells read its 'Test/Dev Input' column for the
# questions and its 'Expected Response' column for the reference answers.
excel = pd.read_excel('530_project_test_dev.xlsx')

In [None]:
# Preview the first two rows.
excel.head(2)

Unnamed: 0,Test/Dev Input,Expected Response,Name (if applicable)
0,What is my favorite ice cream?,I do not know the answer to that,
1,I want to do a capstone on the historical infl...,I am only able to recommend professors that wo...,


In [None]:
# Professor table: a 'professor' label column plus the feature columns that the
# classifiers further down are trained on.
df_prof = pd.read_csv('data_professors.csv')

In [None]:
# Column count minus one (presumably excluding 'professor' — confirm); used as
# the width of the all-zero fallback row in the labelling loop.
num_cols = len(df_prof.columns) - 1

In [None]:
%%time
count = 0
for question in excel['Test/Dev Input'].tolist():
  count += 1
  print('count', count)
  output = model(question)
  print(output)
  try:
    s = output.split('\n')
    counter = 0
    items = []
    for item in s:
      items.append(int(item[-1]))
    df.loc[len(df.index)] = [question] + items
  except:
    df.loc[len(df.index)] = [question] + num_cols*[0]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
53. Electrical Engineering: 0
54. Attention in Natural Language Processing: 1
55. Bayesion Machine Learning: 0
56. Climate Environment: 0
57. Big Data: 0
58. Algorithms: 0
59. Probabilistic Algorithms: 0
60. Learning Theory: 0
62. Programming Languages: 0
63. Statistics: 0
64. Systems: 0
65. Quantum Computing: 0
66. Combinatorics: 0
67. Control Theory: 0
68. Ethics: 0
69. Information Theory: 0
70. Computer Science Education: 0
71. Data Mining: 0
72. Bias: 0
73. Convex Optimization: 0
74. Linear Algebra: 0
75. Geometry: 0
76. Compilers: 0
77. Wireless Technology: 0
78. Concept Bottleneck Model: 0
79. 3D Reconstruction: 0
80. Alzheimer's Disease: 0
81. Transfer Learning: 1
82. Real Time Systems: 0
83. Data Visualization: 0
84. Physics: 0
85. Cyber Physical Systems: 0
86. Physical Substrates: 0
87. Game Theory: 0
88. Antitrust: 0
89. Runtime: 0
90. Chatbots: 0
91. Law: 0
92. Networking: 0
93. Cloud Computing: 0
count 19
1. H

In [None]:
# Persist the question/topic matrix. to_csv's default also writes the DataFrame
# index as the first CSV column.
df.to_csv('question_matrices.csv')

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Replace missing values in the professor table with 0.
df_prof = df_prof.fillna(0)

In [None]:
# Features are every column except the 'professor' label.
X_train = df_prof.drop(columns=['professor'])
y_train = df_prof['professor']

In [None]:
# Fixed random_state so the fitted forest is the same on every run.
classifier = RandomForestClassifier(n_estimators = 100, random_state = 42)
classifier.fit(X_train.to_numpy(), y_train)

In [None]:
# Per-question topic flags with the question text removed.
# NOTE(review): relies on the *_yn columns being in the same order as the
# professor feature columns, since both sides are passed as bare arrays — confirm.
X_test = df.drop(columns=['question'])

In [None]:
# One predicted professor label per question.
y_pred = classifier.predict(X_test.to_numpy())

In [None]:
# Output table for the random forest predictions.
final_results = pd.DataFrame()

In [None]:
import numpy as np
final_results['chatbot_answer'] = y_pred
# TODO: the golden-standard column is disabled. The draft below would have
# stored the questions ('Test/Dev Input'), not the expected answers.
#final_results['golden_standard_answer'] = #np.array(excel['Test/Dev Input'].tolist())

In [None]:
final_results.to_csv('results_random_forest.csv')

## SVC n = 4

In [None]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Fit on a plain ndarray, as the random forest cell does, because the model is
# later scored on X_test.to_numpy(). Fitting on the DataFrame and scoring on an
# array makes scikit-learn warn about missing feature names. The pipeline still
# has a single LinearSVC step, as before.
clf = make_pipeline(LinearSVC(random_state=0, tol=1e-5))
clf.fit(X_train.to_numpy(), y_train)

In [None]:
top_n_classes = 4
# Sort each row of decision scores ascending, keep the last top_n_classes
# column positions and reverse them so the best-scoring class comes first.
predictions = clf.decision_function(
                    X_test.to_numpy()).argsort()[:,-top_n_classes:][:,::-1]
# The values are column positions in the score matrix (class indices), not
# professor names.
pred_df = pd.DataFrame(predictions,
                       columns= [f'{i+1}_pred' for i in range(top_n_classes)])



In [None]:
p = classifier.predict_proba(X_test)
n = 3
top_n = np.argsort(p)[:,:-n-1:-1]



In [None]:
d = {}
for i, j in enumerate(df_prof.professor.to_list()):
  d[i] = j

In [None]:
dataset = pd.DataFrame({'pred_1': top_n[:, 0], 'pred_2': top_n[:, 1], 'pred_3': top_n[:, 2], 'pred_4': top_n[:, 2]})

In [None]:
dataset['chatbot_answer'] = dataset['pred_1'].map(d) + ', ' + dataset['pred_2'].map(d) + ', ' + dataset['pred_3'].map(d) + ', ' + dataset['pred_4'].map(d)

In [None]:
# Reference answers from the evaluation workbook, attached by row order.
dataset['golden_standard_answer'] = np.array(excel['Expected Response'].tolist())

In [None]:
dataset.to_csv('results_svc.csv')

## KNN

In [None]:
# Same preparation as in the Random Forest section, repeated here:
# replace missing values with 0, then split features from the 'professor' label.
df_prof = df_prof.fillna(0)

In [None]:
X_train = df_prof.drop(columns=['professor'])
y_train = df_prof['professor']

In [None]:
# Preview the first two rows of the professor table.
df_prof.head(2)

Unnamed: 0,professor,high_frequency_trading_bool,quantitative_finance,computational_finance_bool,drug_discovery_bool,cancer_bool,biology_bool,radiology_bool,neuroscience_bool,evolution_bool,...,physics_bool,cyber_physical_systems,physical_substrates,game_theory_bool,antitrust_bool,runtime,chatbot,law_bool,networking_bool,cloud_computing_bool
0,Bong Ho Kim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Junhyong Kim,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# n_neighbors=1 is only the estimator's default; the lookup below asks for
# 4 neighbours explicitly.
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train, y_train)

In [None]:
# Per-question topic flags with the question text removed.
X_test = df.drop(columns=['question'])

In [None]:
# ind[i] holds the training-row positions of the 4 nearest rows of X_train for
# question i; the distances are discarded.
_, ind = neigh.kneighbors(X_test.to_numpy(),  n_neighbors=4)



In [None]:
# One column per neighbour rank; values are row positions in the training data.
dataset = pd.DataFrame({'pred_1': ind[:, 0], 'pred_2': ind[:, 1], 'pred_3': ind[:, 2],
                        'pred_4': ind[:, 3],
                        })

In [None]:
# d (defined in the SVC section) maps df_prof row positions to professor names,
# which is what the neighbour indices refer to.
dataset['chatbot_answer'] = dataset['pred_1'].map(d) + ', ' + dataset['pred_2'].map(d) + ', ' + dataset['pred_3'].map(d) + ', ' + dataset['pred_4'].map(d)

In [None]:
# Reference answers from the evaluation workbook, attached by row order.
dataset['golden_standard_answer'] = np.array(excel['Expected Response'].tolist())

In [None]:
dataset.to_csv('results_knn.csv')