In [110]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPool1D
from keras.layers import Bidirectional, Dropout

# 1. Weak Supervision Labeling
def weak_supervision_label(question):
    if not isinstance(question, str):
        return [0.33, 0.33, 0.33]  # Default uncertain label for non-string input

    question = question.lower()
    
    # Define similar question words for each category
    what_like = ["what", "which", "who", "where", "when"]
    how_like = ["how", "in what way", "by what means"]
    why_like = ["why", "for what reason", "how come"]
    
    # Initialize the label
    label = np.array([0.0, 0.0, 0.0])
    
    # Define weights for the word position in the question
    weight_start = 0.5
    weight_middle = 1.0
    weight_end = 2.0
    
    # Helper function to apply weights based on position
    def apply_weights(word_list, weight):
        nonlocal label
        for word in word_list:
            if word in question:
                if question.endswith(word):
                    label += np.array([weight_end if word in what_like else 0, 
                                        weight_end if word in how_like else 0, 
                                        weight_end if word in why_like else 0])
                elif question.startswith(word):
                    label += np.array([weight_start if word in what_like else 0, 
                                        weight_start if word in how_like else 0, 
                                        weight_start if word in why_like else 0])
                else: 
                    label += np.array([weight_middle if word in what_like else 0, 
                                        weight_middle if word in how_like else 0, 
                                        weight_middle if word in why_like else 0])
    
    # Apply the weights for each category
    apply_weights(what_like, weight_middle)
    apply_weights(how_like, weight_middle)
    apply_weights(why_like, weight_middle)
    
    # Normalize the label to sum to 1
    if np.sum(label) > 0:
        label /= np.sum(label)
    else:
        label = np.array([0.33, 0.33, 0.33])  # Uncertain label
    
    return label.tolist()

# Example usage
question = "How does this work?"
print(weak_supervision_label(question))


[0.0, 1.0, 0.0]


In [111]:
import pandas as pd

In [112]:
data_csv = pd.read_csv('/Users/lancesanterre/intern_2024/data/uncleaned/q_quora.csv')
data_csv['question1'].shape

  data_csv = pd.read_csv('/Users/lancesanterre/intern_2024/data/uncleaned/q_quora.csv')


(404351,)

In [114]:
data = pd.read_json('data/uncleaned/dev-v1.1.json')

In [115]:
questions = []
for entry in data["data"]:
    for paragraph in entry["paragraphs"]:
        for qa in paragraph["qas"]:
            questions.append(qa["question"])
for quest in data_csv['question1']:
    questions.append(quest)


In [116]:
labels = list(map(weak_supervision_label, questions))


In [117]:
df = pd.DataFrame(questions)
df

Unnamed: 0,0
0,Which NFL team represented the AFC at Super Bo...
1,Which NFL team represented the NFC at Super Bo...
2,Where did Super Bowl 50 take place?
3,Which NFL team won Super Bowl 50?
4,What color was used to emphasize the 50th anni...
...,...
414916,How many keywords are there in the Racket prog...
414917,Do you believe there is life after death?
414918,What is one coin?
414919,What is the approx annual cost of living while...


In [118]:
df.insert(1,'labels',labels)
df.head(5)

Unnamed: 0,0,labels
0,Which NFL team represented the AFC at Super Bo...,"[1.0, 0.0, 0.0]"
1,Which NFL team represented the NFC at Super Bo...,"[1.0, 0.0, 0.0]"
2,Where did Super Bowl 50 take place?,"[1.0, 0.0, 0.0]"
3,Which NFL team won Super Bowl 50?,"[1.0, 0.0, 0.0]"
4,What color was used to emphasize the 50th anni...,"[1.0, 0.0, 0.0]"


In [119]:
def filter(df):
    # Convert the DataFrame to a NumPy array
    data_array = df.to_numpy()

    # Initialize lists to store rows based on the filter
    equal_to_33 = []
    not_equal_to_33 = []

    # Iterate over the rows in the array
    for row in data_array:
        # Check the condition in the 'labels' column (assumed to be the last column)
        labels = row[-1]  # Assuming 'labels' is the last column
        if all(label == 0.33 for label in labels):
            equal_to_33.append(row)
        else:
            not_equal_to_33.append(row)

    # Convert the lists back to DataFrames
    df_equal_to_33 = pd.DataFrame(equal_to_33, columns=df.columns)
    df_not_equal_to_33 = pd.DataFrame(not_equal_to_33, columns=df.columns)

    return df_equal_to_33, df_not_equal_to_33

In [120]:
unsure, sure = filter(df)

In [127]:
df.to_pickle('data.pkl', compression='gzip')


In [121]:
filtered_questions = sure[0]
filtered_labels = sure['labels']

In [122]:
print(filtered_questions.info(),filtered_labels.info())


<class 'pandas.core.series.Series'>
RangeIndex: 347058 entries, 0 to 347057
Series name: 0
Non-Null Count   Dtype 
--------------   ----- 
347058 non-null  object
dtypes: object(1)
memory usage: 2.6+ MB
<class 'pandas.core.series.Series'>
RangeIndex: 347058 entries, 0 to 347057
Series name: labels
Non-Null Count   Dtype 
--------------   ----- 
347058 non-null  object
dtypes: object(1)
memory usage: 2.6+ MB
None None


In [128]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(filtered_questions)
sequences = tokenizer.texts_to_sequences(filtered_questions)
X = pad_sequences(sequences, maxlen=10)

# Convert labels to numpy array
y = np.array(filtered_labels.tolist())  # Convert Series of lists to a NumPy array

from keras.layers import BatchNormalization

model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=10),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.5),
    BatchNormalization(),
    LSTM(64, return_sequences=True),
    GlobalMaxPool1D(),
    Dense(64, kernel_regularizer='l2', activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])
# 3. Compile the Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Model Training
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# 6. Evaluate Model Performance
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

Epoch 1/10




[1m 147/8677[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:25[0m 17ms/step - accuracy: 0.7511 - loss: 1.1055

KeyboardInterrupt: 

In [124]:
ambiguous_questions = unsure[0]

In [126]:

if len(ambiguous_questions) == 0:
    print("No ambiguous questions found.")
else: 
    ambiguous_sequences = tokenizer.texts_to_sequences(ambiguous_questions)
    ambiguous_X = pad_sequences(ambiguous_sequences, maxlen=10)

    # Perform prediction only if ambiguous_X is not empty
    if ambiguous_X.size > 0:
        predictions = model.predict(ambiguous_X)
        for question, prediction in zip(ambiguous_questions, predictions):
            print(f"Question: {question}")
            print(f"Predicted Label: {np.round(prediction, 2)}")
    else:
        print("No valid sequences for prediction.")

AttributeError: 'float' object has no attribute 'lower'

In [86]:

def preprocess_question(question, tokenizer, maxlen=10):
    # Tokenize the question
    sequence = tokenizer.texts_to_sequences([question])
    # Pad the sequence to ensure it has the correct length
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    return padded_sequence

def predict_question(question, model, tokenizer, maxlen=10):
    # Preprocess the question
    processed_question = preprocess_question(question, tokenizer, maxlen)
    # Predict the category
    prediction = model.predict(processed_question)
    # Return the prediction
    return prediction

# Example question
example_question = "Explain the process of photosynthesis in plants."
prediction = predict_question(example_question, model, tokenizer)

# Print the prediction
print(np.round(prediction, 2))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[[1. 0. 0.]]
