# Install Libraries

In [12]:
%%capture
!pip3 install weaviate-client transformers torch

# Create Spark Session and Weaviate Client

In [2]:
from pyspark.sql import SparkSession
import os
import weaviate
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql import SparkSession
from transformers import AutoTokenizer, AutoModel
import torch
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType


# Endpoint of the Weaviate instance used by the Python client.
WEAVIATE_URL = "http://weaviate:8080"
client = weaviate.Client(WEAVIATE_URL)

# Maven coordinates of the Weaviate Spark connector, passed to spark.jars.packages.
WEAVIATE_CONNECTOR = "io.weaviate:spark-connector_2.12:1.3.2"

session_builder = SparkSession.builder.master("local[*]").appName("weaviate")
spark = session_builder.config("spark.jars.packages", WEAVIATE_CONNECTOR).getOrCreate()

# Only log Spark messages at WARN level.
spark.sparkContext.setLogLevel("WARN")

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


# Load Model and Generate Vectors

In [3]:
def load_model_and_tokenizer(model_name="distilbert-base-uncased"):
    """Load a Hugging Face tokenizer together with its matching model.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained`` for both
            the tokenizer and the model. Defaults to ``"distilbert-base-uncased"``,
            the checkpoint this notebook originally hard-coded.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model
def generate_vectors(description):
    """Embed a piece of text using the notebook-level ``tokenizer`` and ``model``.

    Args:
        description: The text to embed, or ``None``.

    Returns:
        A flat list of floats taken from position 0 of the model's
        ``last_hidden_state``, or ``None`` when ``description`` is ``None``.
    """
    # courseDescription is a nullable column, so this function can be called
    # with None when used as a Spark UDF; return a null vector instead of
    # passing None to the tokenizer.
    if description is None:
        return None
    tokens = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**tokens)
    # Keep the hidden state at sequence position 0 and flatten it to a plain list.
    return outputs.last_hidden_state[:, 0, :].numpy().flatten().tolist()
    
# Load the tokenizer and model once; generate_vectors reads these two module-level names.
tokenizer, model = load_model_and_tokenizer()



tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# Load Data

In [4]:
# Path of the JSON file holding the course records.
course_json_path = "/home/jovyan/work/course.json"
df = spark.read.format("json").load(course_json_path)

In [5]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- attendancePolicy: string (nullable = true)
 |-- classSchedule: string (nullable = true)
 |-- courseDescription: string (nullable = true)
 |-- courseFacultyName: string (nullable = true)
 |-- courseFacultyOfficeHours: string (nullable = true)
 |-- courseName: string (nullable = true)
 |-- courseOfficeHours: string (nullable = true)
 |-- dueDates: string (nullable = true)
 |-- learningOutcomes: string (nullable = true)
 |-- prerequisites: string (nullable = true)
 |-- requiredTexts: string (nullable = true)



# Generate Vector for courseDescription

In [6]:
# Preview the loaded rows as a text table.
df.show()

+--------------------+--------------------+--------------------+-----------------+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    attendancePolicy|       classSchedule|   courseDescription|courseFacultyName|courseFacultyOfficeHours|          courseName|   courseOfficeHours|            dueDates|    learningOutcomes|       prerequisites|       requiredTexts|
+--------------------+--------------------+--------------------+-----------------+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Attendance is not...|Tuesday Class: Hi...|A broad introduct...|       Akit Kumar|    TBD – See Blackbo...|IST-718 Big Data ...|TBD – See Blackbo...|Assignment dates ...|Translate a busin...|Familiarity with ...|Python Data Scien...|
|Attendance in cla...|Mondays and Wedne...|Knowledge represe...|

In [7]:
# Wrap generate_vectors as a Spark UDF that returns an array of floats.
embedding_type = ArrayType(FloatType())
vector_udf = udf(generate_vectors, returnType=embedding_type)

# Add the embedding of each course description as a new "vector" column.
description_column = df.courseDescription
df_vec = df.withColumn("vector", vector_udf(description_column))
df_vec.printSchema()

root
 |-- attendancePolicy: string (nullable = true)
 |-- classSchedule: string (nullable = true)
 |-- courseDescription: string (nullable = true)
 |-- courseFacultyName: string (nullable = true)
 |-- courseFacultyOfficeHours: string (nullable = true)
 |-- courseName: string (nullable = true)
 |-- courseOfficeHours: string (nullable = true)
 |-- dueDates: string (nullable = true)
 |-- learningOutcomes: string (nullable = true)
 |-- prerequisites: string (nullable = true)
 |-- requiredTexts: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: float (containsNull = true)



# Pushing Custom Schema to Weaviate

In [None]:
# Recreate only the "Course" class, so re-running this cell does not wipe
# unrelated classes that may live in the same Weaviate instance.
COURSE_CLASS = "Course"

existing_classes = {c["class"] for c in (client.schema.get().get("classes") or [])}
if COURSE_CLASS in existing_classes:
    client.schema.delete_class(COURSE_CLASS)

# (property name, description) pairs; every property uses the same data type.
# NOTE(review): newer Weaviate releases prefer "text" over "string" - confirm
# against the server version before changing the data type.
course_properties = [
    ("courseName", "The name of the course."),
    ("courseDescription", "A detailed description of what the course covers."),
    ("courseFacultyName", "Name of the faculty teaching the course."),
    ("courseFacultyOfficeHours", "Scheduled office hours for the course faculty."),
    ("courseOfficeHours", "General office hours related to the course."),
    ("requiredTexts", "List of required texts for the course."),
    ("learningOutcomes", "Expected learning outcomes for the students."),
    ("attendancePolicy", "Policy regarding attendance in the course."),
    ("classSchedule", "Schedule of classes."),
    ("dueDates", "Important dates and deadlines for course assignments."),
    ("prerequisites", "Prerequisites required for enrolling in the course."),
]

# Now, create a new class for "Course"
client.schema.create_class(
    {
        "class": COURSE_CLASS,
        "description": "A class representing the courses including their descriptions, schedules, and other related information.",
        # Vectors are computed in this notebook and supplied on write, so no
        # server-side vectorizer should be attached to the class.
        "vectorizer": "none",
        "properties": [
            {"name": name, "dataType": ["string"], "description": description}
            for name, description in course_properties
        ],
    }
)


# Pushing Data to Weaviate

In [9]:
# Append every row to the "Course" class, using the "vector" column as the object's vector.
weaviate_writer = (
    df_vec.write.format("io.weaviate.spark.Weaviate")
    .option("scheme", "http")
    .option("host", "weaviate:8080")
    .option("className", "Course")
    .option("vector", "vector")
    .mode("append")
)
weaviate_writer.save()

# Query Data from Weaviate

In [10]:
# Search phrase, embedded with the same function used for the stored course descriptions.
question = "automated reasoning, computer vision, and natural language processing"
q_v = dict(vector=generate_vectors(question))

In [11]:
import json

# Ensure these fields exist in your schema
returned_fields = ["courseName", "courseDescription", "courseFacultyName"]
course_query = client.query.get("Course", returned_fields)

# Adjust limit as needed
nearest_course_query = course_query.with_near_vector(q_v).with_limit(1)
response = nearest_course_query.do()

# Pretty-print the raw response as indented JSON.
print(json.dumps(response, indent=4))



{
    "data": {
        "Get": {
            "Course": [
                {
                    "courseDescription": "Knowledge representation, production systems, search algorithms, game playing, uncertainty handling, learning, automated reasoning, computer vision, and natural language processing. Programming project or term paper required for CIS 667, not for CIS 467. The course aims to prepare students to work and live in a world increasingly influenced by artificial intelligence.",
                    "courseFacultyName": "Garrett Katz",
                    "courseName": "CIS 467/667: Introduction to Artificial Intelligence"
                }
            ]
        }
    }
}
