In [450]:
import requests

def send_solr_request(q):
    url = "http://localhost:8983/solr/animals/query"
    
    # Define the query parameters
    params = {
        'q': q,
        "defType": "edismax",
        "qf": "Name^2.5 Features^2.0 Fun_Fact^2.0 Diet^2.0 Text^1.5 Features^2.0 Behavior^2.0",
        "pf": "Name^2.5 Features^2.0 Fun_Fact^2.0 Diet^2.0 Text^1.5 Features^2.0 Behavior^2.0",
        "mm": "3<-25%",
        "ps": 5,
        'rq': '{!ltr model=animals_model efi.text=\'natural habitat\'}',
        'fl': 'id,Name,score,[features]',
        "rows": "30"

    }

    try:
        # Send the HTTP GET request
        response = requests.get(url, params=params)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Print the response content
            # print("Response:")
            return (response.json())
        else:
            print(f"Error: {response.status_code} - {response.text}")

    except requests.RequestException as e:
        print(f"Error: {e}")

def parse_solr_response(response_json):
    # Extract the list of animals from the Solr response
    animals = []

    # Check if 'response' and 'docs' keys exist in the response
    if 'response' in response_json and 'docs' in response_json['response']:
        for doc in response_json['response']['docs']:
            # Check if 'Name' and 'score' keys exist in the document
            if 'Name' in doc and 'score' in doc:
                animal = {
                    'Name': doc['Name'],
                    '[features]': doc['[features]']
                }
                animals.append(animal)

    return animals

res = send_solr_request('natural habitat')

print (parse_solr_response(res))

[{'Name': 'Kiko Goat', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatchFun_Fact=1.675411,queryMatchMigratory=0.0,queryMatchText=0.47909436,originalScore=530.3125'}, {'Name': 'Grouse', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatchFun_Fact=0.0,queryMatchMigratory=0.0,queryMatchText=0.48193377,originalScore=8.499506'}, {'Name': 'Sidewinder', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatchFun_Fact=0.0,queryMatchMigratory=0.0,queryMatchText=0.47588998,originalScore=8.481009'}, {'Name': 'Mouse Deer Chevrotain', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatchOrigin=0.0,queryMatchFun_Fact=1.6180977,queryMatchMigratory=0.0,queryMatchText=0.47838327,originalScore=8.472482'}, {'Name': 'Tiger Swallowtail', '[features]': 'queryMatchName=0.0,queryMatchGenus=0.0,queryMatchClass=0.0,queryMatch

In [451]:
import pandas as pd
import json as simplejson

queries = ["Energetic dog breeds suited for hunting",
"North America animals that like to eat insects",
"Change the color of their skin, fur or feathers for the purpose of camouflage",
"Animals that walk in hierarchical groups or herds and how they deal with territory", "(NOT Birds) migrate to Mexico or migrate to America"]

for idx, q in enumerate(queries):
    res = send_solr_request(q)
    animals = parse_solr_response(res)

    df = pd.DataFrame(animals)
    # df.to_csv('queries/query{0}_results.csv'.format(idx+1), index=False)

    # drop every column except the Name 
    # df = df[df.columns[0:1]]

    df['score'] = 0.0

    # save the query results to a csv file
    # df.to_csv('queries/query{0}_score.csv'.format(idx+1), index=False)


### Criteria
- 0 - A document that does not match the query.
- 1 - A document that vaguely matches the query, is very incomplete (missing important fields, like instructions) and has no reviews. Or has very negative reviews.
- 2 - A document that partially matches the query, is incomplete. 
- 3 - A document that matches the query semantically, is reasonably complete (may miss more than two fields) and has at least one positive review.
- 4 - A document that perfectly or almost perfectly matches the query semantically, is complete or missing just one of the fields.
- 5 - A document that perfectly matches the query semantically, is complete (the recipe has a full ingredient list, steps and cook time/nutritional information).




In [452]:
## linear model using svm


import pandas as pd
import glob

result_files = glob.glob("queries/*_results_done.csv")
scores_files = glob.glob("queries/*_score_done.csv")

result_files.sort()
scores_files.sort()

inputs = pd.concat((pd.read_csv(file) for file in result_files), ignore_index=True)
scores = pd.concat((pd.read_csv(file) for file in scores_files), ignore_index=True)


#merge results_files and scores_files into a single data frame called ltr
for ind, pair in enumerate(zip(result_files, scores_files)):
    result = pd.read_csv(pair[0])
    score = pd.read_csv(pair[1])
    result['score'] = score['score']
    result['Name'] = score['Name']
    result.to_csv('queries/query{0}_ltr.csv'.format(ind+1), index=False)

X = []
Y = [entry.score for entry in scores.itertuples()]


In [453]:
def get_features(entry):
    return [float(feature.split("=")[1]) for feature in entry._2.split(",")]


for entry in inputs.itertuples():

    X.append(get_features(entry))


In [454]:


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm, linear_model
from sklearn.metrics import r2_score

scaler = StandardScaler()

scaler.fit(X)
X = scaler.fit_transform(X)

(train_x,
 test_x,
 train_y,
 test_y) = train_test_split(X, Y, test_size=0.10, random_state=0)

 
scaler.fit(X)
X = scaler.fit_transform(X)
best_random_state = None
best_r2_score = -1

# for random_state in range(0, 100):
#     # Split the data using the current random state
#     train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.20, random_state=random_state)

#     # Perform any necessary preprocessing steps
#     scaler = StandardScaler()
#     scaler.fit(train_x)
#     train_x = scaler.transform(train_x)
#     test_x = scaler.transform(test_x)

#     # Train your model
#     model =  svm.LinearSVR()  # Replace YourModel with the actual model you are using
#     model.fit(train_x, train_y)

#     # Make predictions on the test set
#     predictions = model.predict(test_x)

#     # Evaluate the model using a suitable metric (e.g., R-squared score)
#     r2 = r2_score(test_y, predictions)

#     # Update the best random state if necessary
#     if r2 > best_r2_score:
#         best_r2_score = r2
#         best_random_state = random_state

# print("Best Random State:", best_random_state)
# print("Best R-squared Score:", best_r2_score)



In [455]:
from sklearn import svm, linear_model
from sklearn.metrics import r2_score

linearSVM = svm.LinearSVR()
lienarReg = linear_model.LinearRegression()

linearSVM.fit(train_x, train_y)
lienarReg.fit(train_x, train_y)

pred_svm = linearSVM.predict(test_x)
pred_reg = lienarReg.predict(test_x)

r2_score(test_y, pred_svm)

-0.08454806320262342

In [456]:
# fit with all data
# linearSVM = svm.LinearSVR()
# lienarReg = linear_model.LinearRegression()

# linearSVM.fit(X, Y)
# lienarReg.fit(X, Y)


In [457]:
## print the scores of the features in the model
print(linearSVM.coef_)


[-0.0148142   0.          0.          0.         -0.01010227  0.
  0.03712187  0.21099972]


In [458]:
print(lienarReg.coef_)

## score of linear regression
r2_score(test_y, pred_reg)

[-4.08421857e-03 -7.49400542e-16  1.12757026e-16  0.00000000e+00
 -8.19807676e-02  0.00000000e+00 -1.35990977e-01  2.10821428e-02]


-0.06585008112627277